diff --git a/app.py b/app.py
index 76be6bb5fa9103091688d197be6c5c55f8dd64a7..d7ea4bcfdebce4f6dabf2b095392c24d8cddd30e 100644
--- a/app.py
+++ b/app.py
@@ -55,7 +55,8 @@ with gr.Blocks() as block:
)
# Define different captions for each table
- default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword.
The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806).
$\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
+ default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword.
The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806).
Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data.
$\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ "
+
single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword.
This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only.
Compared to the default table, some models with only single-image support are added."
caption_component = gr.Markdown(
diff --git a/constants.py b/constants.py
index 5ede1ed201bf80874ccae05283aa8832056c6e10..d6e2eb11c7d02f59ef39d5e3be9127709089b65f 100644
--- a/constants.py
+++ b/constants.py
@@ -28,7 +28,7 @@ We aim to provide cost-effective and accurate evaluation for multimodal models,
## 📊🔍 Results & Takeaways from Evaluating Top Models
-- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0622) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
+- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves noticeably over Claude 3.5 Sonnet (0620) in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
- Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
- Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
- Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
diff --git a/static/eval_results/Default/Aquila_VL_2B/summary_results.json b/static/eval_results/Default/Aquila_VL_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8aaeeec492de6dabf76847d0cb433cab957a2f9d
--- /dev/null
+++ b/static/eval_results/Default/Aquila_VL_2B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.159970161379836,
+ "micro_mean_score": 0.15844711671722148
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.24567572098570653,
+ "micro_mean_score": 0.2704213241616509
+ },
+ "overall_score": 0.17100157004197775
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.1796551584774396
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.1263506560912463
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.1775085349123463
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.2114933522881099
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.16251700109869488
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.26453155444796583
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.3729498746867168
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.19090788408036002
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.16500679466160564
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.03972686819521137
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.07035116566014021
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.11915109312705179
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.18915652635850314
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.21939978337316163
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.17643260913333875
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2438396314831894
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.08989401697906672
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.12241197113963243
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.10758402844431432
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.19372082302321905
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.19201243810115767
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.23278612647548963
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.21664527852608348
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.12138133030990172
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.01221681479628382
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.17994400163273605
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.21939978337316163
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.18212149746318507
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.21563163558700174
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.0981320856519089
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.0557399538308785
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.1351126472094214
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.2025034827431662
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.29326275059361956
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.22529225586731416
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.23810497886903373
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.17867138975396438
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Aquila_VL_2B/task_results.json b/static/eval_results/Default/Aquila_VL_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c82047a713cf36db47e26bd58bec56a58a4ce85
--- /dev/null
+++ b/static/eval_results/Default/Aquila_VL_2B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.7138859642533433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.20357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.061224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5106271997072349,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.020071738122614136,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.65,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.08928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.07352941176470588,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.0707070707070707,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.2571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 2.0145220870414344e-06,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.018365235176046272,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.21776661396440047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.0606060606060606,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.4600736842105264,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.07586052215812712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.07539682539682542,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.08888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.08928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.5111111111111112,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.2665237458765274,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.034222739980969856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.21428571428571422,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.1700714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.18333333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.13601920423828534,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.09999999999999996,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.3688571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.11224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.08,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04401625959886561,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.10317460317460318,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.03361344537815126,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.3578947368421052,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.28888888888888886,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.11607142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.14799999999999996,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.30685136455043505,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.4107142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.4050000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6823529411764706,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.11862244897959183,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.09268707482993196,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.020833333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.35172413793103446,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.07857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.19310344827586204,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.22413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.17857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.1866666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.4142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.3482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.5157894736842106,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.24375000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.060000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.25555555555555554,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.22,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.14482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.43793103448275855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.37931034482758613,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.48125,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.20344827586206896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.18666666666666673,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3578947368421052,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.12142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.1758620689655172,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5038461538461537,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.084,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4548387096774193,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.2533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.26896551724137935,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.37368421052631573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.2157894736842105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.12142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.07857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.836842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8421052631578949,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.5450000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.61,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.4499999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8578947368421055,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6849999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Aria/summary_results.json b/static/eval_results/Default/Aria/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..348d2d8b5f081312e2c2629ee53791750ebf9e42
--- /dev/null
+++ b/static/eval_results/Default/Aria/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.289073788209904,
+ "micro_mean_score": 0.2859007507765791
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5103725263180767,
+ "micro_mean_score": 0.5349957007738607
+ },
+ "overall_score": 0.31755778420402525
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.3153649050553317
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.34425736922415495
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.3921740378709932
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.37623282710622424
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.271674311347156
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.46313777834281344
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.5692180451127821
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.3152064038837139
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.23851147782276536
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.11246568298589892
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.28561724084490353
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.2505346698796475
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.3040414715952029
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.41865640360591405
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3622713579911698
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.35872259826035346
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.1509096092007215
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.2846987779732631
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.2899384042262363
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.27412885527802433
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.3117275816801635
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.4523860109667709
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.310055869988487
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.18301681783824644
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.26651659725352617
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.34236220565522313
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.41865640360591405
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.19142683154129833
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2596336265133595
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.3929243812973524
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.1403503245041943
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.25367910605102256
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.3494812758481046
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3662927672998609
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.28616079233761366
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3953949223279651
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.26097385403450996
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Aria/task_results.json b/static/eval_results/Default/Aria/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c211356d1853516b2bf00bd3f361964f56fec5ed
--- /dev/null
+++ b/static/eval_results/Default/Aria/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.647700584092455,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.12244897959183673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.26071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.5515873015873015,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5736169347206616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.034980972645280155,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.27976190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3740196078431373,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1414141414141414,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.18571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.22916666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.10270184425364004,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.16071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0004322891149600856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3016653054893176,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.3965511477601961,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.14545454545454545,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.003968253968253968,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.3275052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5092740145201512,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.7103174603174602,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.04242857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.24583333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.21327870239533622,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.6222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.33392901390622537,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.01240457703042735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8896103896103896,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6950714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.38468048224583823,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5441428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8993142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5510204081632654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6285714285714284,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.2916666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.66,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.048240995325583465,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9411764705882355,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6126284210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.1646958321823573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5973333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5459704567502287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.25704365079365077,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.5011904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6962105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.23333333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.21811224489795916,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1498015873015873,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.1904761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.08571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8103448275862066,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.47142857142857136,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.6620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.46896551724137925,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.6888888888888888,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7965517241379312,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.5157894736842106,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.21874999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5266666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.5666666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.31333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7275862068965517,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8375000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.5428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5758620689655174,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.32105263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.6071428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.48275862068965514,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.25,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.7071428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.24999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.8285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.75,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.172,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.7000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.47419354838709676,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.5133333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7736842105263159,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.37894736842105275,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.3928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.08571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.2357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.3714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.8894736842105264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.836842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7750000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7000000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.41,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7200000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Claude_3.5/summary_results.json b/static/eval_results/Default/Claude_3.5/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d90792e8314fea5c53f068d815fd6ebdff3bd724
--- /dev/null
+++ b/static/eval_results/Default/Claude_3.5/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.5040975742801586,
+ "micro_mean_score": 0.5002259116666758
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.6373907158949892,
+ "micro_mean_score": 0.6569647463456579
+ },
+ "overall_score": 0.5212541172602853
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.5405089647404562
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.6082834220752651
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.5745077617490254
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.5450038475783499
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.4767692987630454
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5756126284078804
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6969774436090224
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.5278843049497918
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.4082144793870471
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.23803578664609892
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.5691641481808987
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.4795267886975966
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.525848282456283
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.508735695828719
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5699094130430454
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.5096772701625744
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.4429640420975014
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.5066797418318023
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.4971460788134188
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.5278127103234661
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.4490020843308984
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5838224169821388
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5456152399978661
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.46300075585789874
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.5414381873407914
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5373019912310933
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.508735695828719
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.4422556748863689
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.49311554035078103
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.6663170946790707
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.3382015835012861
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.5194010220575684
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.532329797132399
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5808831682303479
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.513474611293123
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5507075880782885
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.47461998432626556
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Claude_3.5/task_results.json b/static/eval_results/Default/Claude_3.5/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b2bbfdc4b11a52bdb098938b20cf495eb95baa2e
--- /dev/null
+++ b/static/eval_results/Default/Claude_3.5/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "code_translation_Python",
+ "score": 0.6458333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.3878787878787879,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.47189890122171807,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.4997371675943104,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.8823529411764706,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.32857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "worldle",
+ "score": 0.31144102130193474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.003968253968253968,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.3428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.7172619047619049,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.06698805429719713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.42424242424242425,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0035714285714285718,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.5495098039215687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6543300312736264,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.07140372068949602,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5487385867546344,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.7624716553287981,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.8601190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.7448979591836732,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.6787142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.7321428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.5416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.22448979591836735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.46428571428571425,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09714285714285713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "rebus",
+ "score": 0.5217391304347826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.8250714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6207368421052633,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.2688508092335989,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.7853333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.7053571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.8303571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.8184827502429544,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7698412698412698,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.7095421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.7777777777777779,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.37499999999999994,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.27777777777777773,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.24074074074074073,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5565966568582713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04739437903890144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.5987447167547407,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.5753130452443872,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9087301587301589,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9415584415584416,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.333520279485717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7636842105263157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.7071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5531252543894322,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.8095238095238094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9047619047619049,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.7714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.82,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9841571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.03751549483739501,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.3397142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.5346938775510204,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.4522108843537415,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5529411764705884,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.34523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.47619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.6222222222222221,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.5952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.4166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.8142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.7,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.5857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.7,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.8450000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.58,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.58,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.258,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.325,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.8444444444444446,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.38947368421052636,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.32666666666666655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8473684210526317,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.8857142857142859,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3684210526315789,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.7500000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.7142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.8931034482758619,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.882758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8827586206896549,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.3285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7307692307692307,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8750000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5866666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.8500000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.45806451612903226,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8931034482758619,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6499999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.7517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.3733333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8310344827586205,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.9349999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8850000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.8100000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.4142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.7000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.4928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.4714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.4357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.5785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.47857142857142854,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.32857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Claude_3.5_new/summary_results.json b/static/eval_results/Default/Claude_3.5_new/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b27da6920bcbd055a5c65f822bb65e8153eeedae
--- /dev/null
+++ b/static/eval_results/Default/Claude_3.5_new/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.5259191914020757,
+ "micro_mean_score": 0.5230785894131227
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.6563419761104125,
+ "micro_mean_score": 0.6724419604471196
+ },
+ "overall_score": 0.5427062825031487
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.5690045172520449
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.6220681231036606
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.6077980666415158
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.5511440615639541
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.4885536652013625
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5908204006544897
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6569473684210526
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.5486763511384175
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.4315385951907387
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.2909419331017877
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.6048192628845258
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.48924295292319175
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.556418710368288
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4946691340754988
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5558756390298104
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.5425198547046186
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.44210335381541843
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.5187252051932875
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.5071121107460066
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.5387340524651681
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.4824302644151348
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.6242798397166945
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5782691045270721
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.4630277507828528
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.5914338446093256
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5636254729390459
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4946691340754988
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.4828123870640382
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.48756636014597515
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.6590137441693218
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.39901670035164916
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.5166853031535193
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5561634744977417
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.6123769274172342
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.5512015158810595
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.565796566886933
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.4763267502912362
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Claude_3.5_new/task_results.json b/static/eval_results/Default/Claude_3.5_new/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d6ca93f5f2b7acb7f4f83f3d7d701df0fea1f95
--- /dev/null
+++ b/static/eval_results/Default/Claude_3.5_new/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "vln_identify_robot",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.6753968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.7058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.23684210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.6041666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.21818181818181817,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.40241040325976846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.4714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.4095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.3154761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.32335405958224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.2928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "counting",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.5151515151515151,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.003968253968253968,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.6607142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.4946078431372549,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.06110399705595322,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6764671197732565,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.003401360544217687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5750644816731951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.02989318393830872,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.24489795918367346,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.7452380952380953,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6530612244897959,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.7402142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.7589285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.4583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.5357142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.8218571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.32653061224489793,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.18566544566544566,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.8389999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.5217391304347826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.26289170215820523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.868,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.711111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.8199708454810496,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6805363628538211,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.78125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.8650793650793652,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.8811526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.5238095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.6339993725717702,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4444444444444445,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.4444444444444445,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9166666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.4053571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9480519480519481,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.2531109353882501,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7710526315789472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.09375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.7642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.6062431664706708,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.08106406283795066,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.6274393183836207,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.82,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.2912414965986394,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9743499999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.02306400619990589,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.23921428571428613,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.5666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5529411764705883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.7058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.6928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.48888888888888893,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.8095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.5210884353741496,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.47066326530612246,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.45833333333333326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.7428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6888888888888888,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8642857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.6133333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.6133333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.282,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.27499999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.8777777777777779,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.42105263157894735,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.38666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8578947368421055,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.7142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.892857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.9103448275862066,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3631578947368421,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8620689655172412,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.893103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.8142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.692857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.43571428571428567,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7038461538461539,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.9249999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.7266666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.8428571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4580645161290323,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8344827586206894,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.8379310344827589,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.7413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.2866666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.786206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8379310344827587,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.942857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.7785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.9650000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.825,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7699999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.8350000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.5785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.6285714285714284,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.607142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.6214285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.5214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.5928571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/GPT_4o/summary_results.json b/static/eval_results/Default/GPT_4o/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6af57dc0f78b6677c89cf6d73a5396b2d10b16f8
--- /dev/null
+++ b/static/eval_results/Default/GPT_4o/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.5265030595065238,
+ "micro_mean_score": 0.5236338521693411
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.6478225794744895,
+ "micro_mean_score": 0.665391229578676
+ },
+ "overall_score": 0.5421184432647768
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.5630758211022604
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.6216411634729735
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.616018277142757
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.5823101249498799
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.44177544539510955
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.6345458069232931
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6795263157894738
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.5514924675940659
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.39435038953269674
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.22934807257231926
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.608083455060831
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.491325251564869
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.4999089647103332
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5315979872161023
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5641404607063637
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.5613545677222056
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.47760591698367955
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.5388690453811203
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.48037685656449847
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.5994159671881645
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.44606605087301393
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.6274371950293718
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5448877153826162
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.4751133786848073
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.5343350103400748
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5672657028463585
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5315979872161023
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.4500928191484624
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.4908653289106883
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.7056027785545881
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.33202130899313653
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.5032849161169843
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5510350848991218
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.6095778863474799
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.5283797185155754
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.6135723164021851
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.44047720383044436
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/GPT_4o/task_results.json b/static/eval_results/Default/GPT_4o/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..59a8732af607782d6ba09b3aa9592e8facf3ad7c
--- /dev/null
+++ b/static/eval_results/Default/GPT_4o/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "code_translation_Python",
+ "score": 0.6458333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.32727272727272727,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.5564421945052599,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.24717887154861945,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.8823529411764706,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.5571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.39642857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.9,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.5019920337942146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.3107142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.010912698412698412,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.5050505050505051,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4379245788668292,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.40294117647058825,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.1858388265990491,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6370339174257883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5925323909834338,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0009041591320072332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.7803571428571426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.01601312748867357,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.8583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6632653061224488,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4767857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.3673469387755102,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.4583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.1496598639455782,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "rebus",
+ "score": 0.6956521739130435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7872142857142859,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6918947368421055,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.2785198065092178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.828,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.8303571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.711111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.95,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.8095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.6275228061577963,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.3469387755102041,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.5982549376215841,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.8253968253968255,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.7131684210526317,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.5238095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.5867591836191252,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.8000000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.6444444444444445,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9285714285714288,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8766233766233764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.37559523809523804,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.2903422951989705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.32222222222222224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7417368421052631,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.7142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.6477943776571286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5807339650392197,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.18559785992971775,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.74,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.36607142857142866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.8333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9764785714285713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.038392686848233396,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.12478571428571421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.55,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.5882352941176471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.738095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.31111111111111106,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.47619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.6041666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.4562925170068027,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.36553287981859406,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.7428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.7571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.5428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.5297619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.24404761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8949999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7250000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.765,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.7578947368421054,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.7142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.7214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.6785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.6214285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.4214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6777777777777777,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.9142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.7357142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.6333333333333332,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.7052631578947368,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.36200000000000004,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.8666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.5684210526315789,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.2866666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8315789473684211,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.8785714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.4421052631578948,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.7357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.7285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7423076923076924,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8250000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.8428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8666666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.835714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.49354838709677434,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.5533333333333332,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.8620689655172411,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.7310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.9068965517241377,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6172413793103447,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.627586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8310344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8275862068965518,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/GPT_4o_mini/summary_results.json b/static/eval_results/Default/GPT_4o_mini/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4ea03c1e4c0224b18df4676d6f3f1b2bbef39af
--- /dev/null
+++ b/static/eval_results/Default/GPT_4o_mini/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.40767494558789397,
+ "micro_mean_score": 0.40431644154143376
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.586537827213665,
+ "micro_mean_score": 0.6133276010318144
+ },
+ "overall_score": 0.43069690064863675
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.4492982787524939
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.49026056071002017
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.5168957112681365
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.46731791428406805
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.3406008235342885
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5572925295284307
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6902380952380953
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.4189154010048976
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.2943206715105082
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.19422793560945503
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.47202628409684394
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.3624496929166193
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.38946844562183286
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.45508480503584553
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.47569921440672464
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.465175334092545
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.29410984789062117
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.41242028190533997
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.3906415365938764
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.44244772638735347
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.3629944944697668
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5713834131825314
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.39874839531459466
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.3359977324263039
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.4305788513381019
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.46343334374251277
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.45508480503584553
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.24651576711552803
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.36981497185070983
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5666618234843734
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.2420320329702607
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.3458483931206892
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.43590838051817093
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5176671720617656
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.3554299482098288
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5399167524341886
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.32918280841495845
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/GPT_4o_mini/task_results.json b/static/eval_results/Default/GPT_4o_mini/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac1f7d6a1eed9c18f56a4275b0b839ca7f3f7d7b
--- /dev/null
+++ b/static/eval_results/Default/GPT_4o_mini/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.4404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.02971437714058806,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.2653061224489796,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.07878787878787878,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.35741427136457926,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.30434782608695654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.24047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.43333333333333335,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9119047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6370339174257883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.7058673469387756,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3348039215686274,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.49999999999999994,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.5857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.18482142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.48571428571428577,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.43979842890651355,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.36666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.29292929292929293,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5882352941176472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.22491496598639452,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.2505668934240363,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.7058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.17982456140350878,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.048713528589567665,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.024564069093751337,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6020408163265306,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.6265067061623183,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6480000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7010526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.43662631578947375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.6444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.05952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6955714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.457498007685276,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5423192899685483,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.6488095238095237,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.21349206349206354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6498716440678927,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5455714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.27142857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6964285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.7275263157894736,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5535393001296958,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.20595238095238094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "algebra",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9505142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.1969956173950675,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.24388210678357394,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.4166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.046820973422936174,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8506493506493505,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6357142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.8769841269841271,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9705882352941178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.37142857142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.41428571428571426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.3958333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.14444444444444443,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.43050085804176885,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.004214285714285663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7539682539682541,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8200000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8375000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.5733333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7615384615384616,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.875862068965517,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.7777777777777779,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5206896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8379310344827584,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.872413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.7206896551724139,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.817241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8379310344827586,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.6785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8210526315789474,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4548387096774193,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.6857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.31333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.43749999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.4066666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.5666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.6714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5466666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.4263157894736842,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4157894736842105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8800000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.42857142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7750000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.7357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.2071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3368421052631579,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.3285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.41428571428571426,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.4642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.4857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.72,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.3559999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.6571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.4428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.8642857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..89748aec2730a7b4dd7c3fbdec0e71c34ad210d5
--- /dev/null
+++ b/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.4189319021967416,
+ "micro_mean_score": 0.41567515414375245
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5691365176285039,
+ "micro_mean_score": 0.5987532244196045
+ },
+ "overall_score": 0.4382651695295427
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.46355333176347063
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.4431807648811706
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.4975887290434539
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.49409642663278297
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.38033540105052427
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5621166766717235
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6570726817042606
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.4480877005302385
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.3338006749329557
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.16197013296986068
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.3971534837718938
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.3448204918940882
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.43525833484767545
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4837362543956792
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5111257660425502
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.49366013155105076
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.4001983820478609
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.386988040250785
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.3884226428206387
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.4425893080900246
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.42223626366392253
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5390305634303021
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.472066557554629
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.3666950113378685
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.44571360028283974
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.45400479933257654
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4837362543956792
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.35161402777057993
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.3839609821519984
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.4822341581959653
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.26434115361219657
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.3677547363031234
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.4640301382180305
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5348199655361041
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.4890240042560499
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5126038207415967
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.384818434165593
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json b/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..584c3f67d4d2e17e705f05ab9648442c90fd96e8
--- /dev/null
+++ b/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.8888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.4068877551020408,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.3469387755102041,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.03886509470801488,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.14999999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.6203514739229025,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.8333333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.35558727927939476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.41666666666666663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.22271751659129607,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5476190476190476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.7558635964363686,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.47990196078431374,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.2727272727272727,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.2303030303030303,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3100359127375053,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0319296239070534,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.21282182729551152,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.49714178831993683,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.30612244897959184,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.09826063389901919,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5408163265306122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5916519873131821,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.6702481953279147,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0002062628914307136,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.2894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5600000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.5089285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.746390336033466,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9621285714285712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7057894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.681547619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.5357142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.7555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5798723155227672,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.9017526315789473,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7478991596638657,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.570486129111546,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7672857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.21428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.7220526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.24564101770091742,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0017402394162957552,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.11578571428571437,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7727272727272726,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.2333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.28888888888888886,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.39583333333333326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.33333333333333337,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.7539682539682538,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.589357142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.3928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.19999999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.4714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.24492301011444534,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.8888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.753968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.30434782608695654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07619047619047618,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.27380952380952384,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.35000000000000003,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.3137755102040816,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.2828798185941043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.3654761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7666666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.35624999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.8222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.5199999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.563157894736842,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.6199999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.3466666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8263157894736842,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.6214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.5052631578947369,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.8571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.9068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8187500000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8533333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.32105263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.789655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7758620689655171,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.3428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.24285714285714283,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.3142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.39333333333333337,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.36428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.3071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8137931034482758,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.1857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8448275862068967,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.47857142857142865,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8300000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8500000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6884615384615385,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.8214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.3806451612903227,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.705,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.24285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7850000000000004,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.32142857142857134,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.7068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0710bf3e0c920cb7b8109b90e9bcbdfba2792418
--- /dev/null
+++ b/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.4822473962867704,
+ "micro_mean_score": 0.4764805563057179
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5858190649927173,
+ "micro_mean_score": 0.6104901117798793
+ },
+ "overall_score": 0.4955784031499121
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.5202055934299538
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.5017043129027509
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.5532599716027446
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.546753787203128
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.425969084163906
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5751012914154264
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6982330827067671
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.513647745999633
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.3845337030093212
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.23899503258223884
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.4625032188638111
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.4292353723689881
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.4869625906903554
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5028718355967439
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5584779204331461
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.55005349042813
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.4292127751495457
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.44896309957892694
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.44418591808616864
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.5146447350354234
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.4688623462674191
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5580414823700747
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5538255562099124
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.39066515495086923
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.5370278962809547
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5034399620483027
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5028718355967439
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.4885398161821004
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.45544217378728585
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5421439953094952
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.3335324339429373
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.43465181771633377
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5250631828331306
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5821004797173627
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.5124355410095621
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5722329455291694
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.41210885517904977
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Gemini_1.5_pro_002/task_results.json b/static/eval_results/Default/Gemini_1.5_pro_002/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..97f780571a4c423de0ecfd5d6157ba715845c8f2
--- /dev/null
+++ b/static/eval_results/Default/Gemini_1.5_pro_002/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6199454600186646,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.34523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.6772108843537415,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.4583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.21309523809523806,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.3939393939393939,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.4119942575491687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.15952380952380954,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.3056838524883637,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.29292929292929293,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.31428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4699566675933124,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.4656862745098039,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.7093310229186855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.06762834530316385,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.4497384340940744,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.29166666666666663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.35119047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0035714285714285718,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.3877551020408163,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8025210084033615,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.831857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.391304347826087,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.03864007436439077,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6723157894736841,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.30612244897959184,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.6726190476190476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.30454267975765786,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5510204081632654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.668,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.7153571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5238095238095237,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.7099528290771637,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.14711083476825218,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7460317460317462,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.5595238095238094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6758816417011395,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.82,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.8486368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "algebra",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.8222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.555696767990635,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.2482142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7089473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.8174603174603176,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.3857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8246753246753247,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.66869355335515,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9712357142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.016738273048656067,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.32509082865144884,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.2693571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.3803571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.8333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.7357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.35,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.2777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.36734693877551017,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.40232426303854874,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6117647058823531,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5945319390969315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.49999999999999994,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.8095238095238094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.5416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.4142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.5142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.3714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.36875,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7111111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.788888888888889,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.5666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.594736842105263,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.6133333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.41999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8263157894736843,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4473684210526316,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.48,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.692857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.835714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.8896551724137929,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.80625,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.35263157894736835,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8241379310344826,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8103448275862067,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.8500000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.42580645161290337,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8758620689655172,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6655172413793102,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6357142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.3933333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8137931034482755,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7965517241379312,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8857142857142859,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.82,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7750000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.7142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.74,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6346153846153848,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.48571428571428577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.765,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.42142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.6428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.21428571428571433,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.15000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.22142857142857147,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.3000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.23571428571428577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Idefics3/summary_results.json b/static/eval_results/Default/Idefics3/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ce15d5ce2e9339df0bcb8985694132ef9048c00a
--- /dev/null
+++ b/static/eval_results/Default/Idefics3/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.08956972487602757,
+ "micro_mean_score": 0.08982225274252693
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3210866162255635,
+ "micro_mean_score": 0.35649183147033553
+ },
+ "overall_score": 0.11936892871309657
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.123378776179585
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.09602065544451607
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.1661543932339007
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.13018902877020821
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.11200133210641629
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.1837120314657304
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.2364085213032582
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.15239546294916975
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.08255834173646705
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.03149369112824262
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.06151607584357764
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.10124344675801887
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.14147248511867794
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.15942387460900312
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.17458268378399872
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.13442937440893113
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.02766884416043467
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.15513016850044997
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.03757596375966502
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.05386631116442094
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.0760949224506388
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.2987797010800956
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.10403841600436024
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.0661753590325019
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.09190674791720088
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.12345439179884048
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.15942387460900312
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.11382786944230487
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.10803808254834846
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.11450308988278819
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.04671278220005028
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.0978814644137225
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.13283830731528018
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.09697463995668018
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.1840497279921703
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.1605667124060194
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.09835465288235297
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Idefics3/task_results.json b/static/eval_results/Default/Idefics3/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..459293d139ef01abb92c3847e7553a82abf43617
--- /dev/null
+++ b/static/eval_results/Default/Idefics3/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.09799690552820609,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.05263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.37186147186147184,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 5.419227899761125e-10,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.043853084084109095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.045454545454545456,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6359252430381498,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.09947368421052631,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.11904761904761907,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.022894736842105266,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.10714285714285712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.005857142857142854,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.23809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.018367346938775512,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.06933814569716257,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.36377551020408166,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.15151515151515152,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.18627450980392157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0016611295681063123,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.3937189896097942,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.020833333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.07058823529411766,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.01020408163265306,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.05257936507936508,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.10800000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5793103448275864,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.625,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.018750000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.20526315789473684,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.3571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.603448275862069,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6827586206896553,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.3444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.4357142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.26842105263157895,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.26666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.2172413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.12142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.23793103448275857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.1866666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.5379310344827587,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.47419354838709676,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.38965517241379305,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.22666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.14666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5192307692307693,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.07777777777777778,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.21052631578947367,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.42068965517241386,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7631578947368421,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.45862068965517244,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.3466666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6500000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8800000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6800000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.836842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.265,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.2571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.13571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_2B/summary_results.json b/static/eval_results/Default/InternVL2_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0243043e9c2981fd3df05c1a2f24eb91964c05ea
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_2B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.13141974398938763,
+ "micro_mean_score": 0.13063500716262516
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.23864417043743646,
+ "micro_mean_score": 0.24901117798796224
+ },
+ "overall_score": 0.14522090778963154
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.14491178903291552
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.12126906675624163
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.16912754929321935
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.18542274192083463
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.13923308734553164
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.23992252224543772
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.3420927318295739
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.14807577209152425
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.13036555933925006
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.01727799227799228
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.057021136657850864
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.10504085961245285
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.1625198552182714
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.18999779001767986
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.1487677475708977
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2011727338536935
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.11886936592818943
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.1131404778887607
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.05739750616837997
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.15465451663650032
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.16044698450090833
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.21429521387724249
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.2128614316540013
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.03658352229780801
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.05757839721254354
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.15225683687839608
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.18999779001767986
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17677460549936644
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.158165588340436
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.08722661966805
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.04102853815875594
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.11264043251709285
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.17001758160301803
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3332891958712894
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.1686125516807394
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.21169137106199268
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.10975764217070672
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_2B/task_results.json b/static/eval_results/Default/InternVL2_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c19e281eb17189df0b11341391798694f472783
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_2B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6370339174257883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.04047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.2936507936507936,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.010714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.02976190476190476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.0707070707070707,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.20096524579696584,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.006060606060606061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.18421052631578946,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.4296473684210526,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.24444444444444446,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3227898751277714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.2597402597402597,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.26914285714285713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.057101180112838316,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.2217857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8666499999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.12857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.0008403361344537821,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.34033613445378147,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.48157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5879999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.2861111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.513578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.10076530612244897,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.08602607709750568,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.11904761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.17777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.23103448275862065,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.25000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.11379310344827588,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.03793103448275863,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.13333333333333336,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.24000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.3551724137931033,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.3368421052631579,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.16874999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.29333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.2777777777777778,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.15333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.18620689655172415,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.2551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.3379310344827585,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.5687500000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.18275862068965518,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.11428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.29333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3578947368421052,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.40714285714285703,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.08965517241379312,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.24999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.24285714285714288,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.25,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.4923076923076924,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.032,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.3870967741935483,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.2333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.2517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.18421052631578946,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.09473684210526315,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.05714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.11428571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.12857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.7157894736842105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.6421052631578948,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7150000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.565,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.29,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.668421052631579,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.5699999999999998,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_5_2B/summary_results.json b/static/eval_results/Default/InternVL2_5_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8718f5d302518ea84ef84b781f4f5270625aa50
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_5_2B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.17806821966478364,
+ "micro_mean_score": 0.17708809739236367
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.2738430375585404,
+ "micro_mean_score": 0.2905417024935512
+ },
+ "overall_score": 0.19039567147289096
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.19614682488147464
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.18910947570579717
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.20543964378430513
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.23636598588530347
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.15691382827270517
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.28604169870255614
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4248446115288219
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.18745928331343714
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.15097551654513372
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.030568378443583684
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.13898447520398388
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.13154711942685113
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.18343540213068474
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.20755556526976354
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.15983467048343838
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.26888883087046195
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.12906517409932386
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.14702422379343882
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.15324148486802894
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.19977956414542175
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.1665590610582109
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.2529339759528222
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.23420071687554841
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.09651832955404382
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.0784280378818194
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.21260786581183966
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.20755556526976354
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.138285387531761
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.20214332169825855
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.18128339685489062
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.053153113565753
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.12416116984428181
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.22449772657901465
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3762336977650326
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.19222024833691936
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.25056132494721467
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.15596334442569906
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_5_2B/task_results.json b/static/eval_results/Default/InternVL2_5_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..34ebcde570112e313b86906890ff139daf9e7cc4
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_5_2B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.3857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.48,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04632755935026561,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.08040063592083609,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.38244047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.6688311688311688,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.08458208458208459,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.17777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.05114285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.21028571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.18817731556471845,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "algebra",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.1174011354666419,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.06547619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.013860848714248784,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.10210634994992598,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.04444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.513578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.4592092436351974,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.14047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.29464285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.7091142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5842105263157894,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.35777368421052635,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.05999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7226890756302522,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.2959183673469387,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.46428571428571425,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.11666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.20535714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.12121212121212122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.764197764824463,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.030303030303030307,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.049999999999999996,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.2792517006802721,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0503968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.8157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.1615633519949754,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.24841274279293252,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.033620994446927885,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1304421768707483,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.10374149659863945,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.41666666666666663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.22352941176470592,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.11250000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.2210526315789474,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.6437499999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.05,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.08888888888888888,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.3620689655172413,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.14285714285714288,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.39285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.4214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.2285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.21333333333333337,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.26896551724137924,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6346153846153848,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.17241379310344832,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.07931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.4285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.21333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.10714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.07600000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.41379310344827597,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.22,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.24736842105263163,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.38965517241379305,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.20344827586206896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4526315789473684,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.28,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.3133333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.48064516129032264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.12068965517241381,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.18620689655172415,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.042857142857142864,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.08571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.44285714285714295,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.6631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.11052631578947371,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.51,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.73,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.7157894736842106,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.615,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.7105263157894738,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.11428571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.13571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_5_78B/summary_results.json b/static/eval_results/Default/InternVL2_5_78B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..055e16e870658b5ad33e53b229171e8ec80d837e
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_5_78B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.44132952988532753,
+ "micro_mean_score": 0.4397079059379812
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5538024772749066,
+ "micro_mean_score": 0.5776870163370592
+ },
+ "overall_score": 0.4558062458859664
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.46893853078050696
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.5220829627238773
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.4933134095077618
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.477971701185214
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.3936387335462224
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5610278744213835
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6072907268170428
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.44533550848682696
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.3548055654857457
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.22852234519925363
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.4910486370158392
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.39410061025954557
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.43424133240430957
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5300255483670417
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.4793195260560365
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.4622918421665308
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.3729954065847296
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.4226567593431527
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.4149806887502539
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.4904285184890861
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.4348674018783908
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5124942746906233
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.4717682857925982
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.20496909081092754
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.4184724897299287
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4951997132559491
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5300255483670417
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.286105084660728
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.39635000103107665
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5401547630322637
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.26403470419652064
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.3933356676003734
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5168098196770042
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.47731479110938463
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.4388571290145052
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5034762755043025
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.37742798395328586
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_5_78B/task_results.json b/static/eval_results/Default/InternVL2_5_78B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3604ee9bfb5e3f354756eec202184714946b91da
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_5_78B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.6517117230612683,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.012684730303418308,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.5833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.021928571428571488,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6530612244897959,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.6583616780045353,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.74,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.5694444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.45210923844984136,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.11345367411269795,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.7291575502542685,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9155844155844154,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.14264455782312926,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.24406600635762046,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6906666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.43826443800341836,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.21111111111111108,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.96218487394958,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6914210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.8253968253968257,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.7769182644835021,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.6888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6821578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7976428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.7767857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9206349206349208,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.7619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9791142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.6518571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.2189325582280229,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.8375210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.19583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6160714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.8095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.21428571428571425,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.20000000000000004,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.08571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.3392857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.04761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.8888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.2941859731424949,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9107142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.437942009022375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.2653061224489796,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.6312074829931973,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.23333333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.21370829033367733,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.2119047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.32323232323232326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.4323529411764706,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6045345424369317,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.014937888198757762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6423672507591217,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.8823529411764706,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.2456140350877193,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.41666666666666663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.4428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.1515151515151515,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.2608695652173913,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.8095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.5882352941176471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.32857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294119,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.3770408163265306,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.5488378684807256,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.6333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.386,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.710344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.7125,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.19999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8172413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.7571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7344827586206897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.6222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.4052631578947368,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.5785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.43333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.7928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.6214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.5827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.26666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7689655172413794,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.45806451612903226,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7862068965517242,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.6866666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.3266666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7538461538461539,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4947368421052632,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.3928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.7724137931034483,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.836842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.5655172413793103,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.52,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.32857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.28571428571428575,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.24999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.55,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.4857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.6571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.5428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7750000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7400000000000003,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6949999999999998,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8157894736842107,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8450000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.19285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.2571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_76B/summary_results.json b/static/eval_results/Default/InternVL2_76B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c244cef39cdaadb040968fc9007b1a1307168c5
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_76B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.3562710424410931,
+ "micro_mean_score": 0.35129859801162616
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5192997443033639,
+ "micro_mean_score": 0.5421324161650903
+ },
+ "overall_score": 0.3772549347599992
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.38193012983650343
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.41315219763443384
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.43665980552577693
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.4265623936500962
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2975890791763991
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5257990949897898
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.5779473684210527
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.33287081421166276
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.2949505390920417
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.17036496432397477
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.3634339625985008
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.31396468806559114
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.3473756113126343
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.395893002855977
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.44982107744035305
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.42875248733027654
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.2868239162778749
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.3630499545707523
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.3476691827105281
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.3943337471922549
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.29244088978470345
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.45822072478616577
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.3879326330400817
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.20309901738473166
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.34771123515123364
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4145693044465943
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.395893002855977
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.24403942809507134
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.3153417935059416
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.4306947454508794
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.2132321995754061
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.2953329718984368
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.42202934355552685
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.47409276729986083
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.30014798153766264
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.4625649385962016
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.2868813944130515
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_76B/task_results.json b/static/eval_results/Default/InternVL2_76B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1438f997ba8a34e91d10a3eca49d3091dd91ca3
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_76B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.5937981812316329,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.021818162950542508,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.4166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.07099999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5612244897959183,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.6527777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.58,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.4633266171344593,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04091742079331852,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.4056667566073084,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9155844155844154,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.2642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5920000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.3876035519415914,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.02222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9327731092436977,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5457894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7936507936507937,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.42222222222222233,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.614578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.8522857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9007936507936509,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.23809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9578785714285712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.46228571428571424,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.5137842105263157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.369047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6339285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.8214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.35555555555555557,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.2777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.3555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07551020408163266,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.26066139706079494,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.4093154979121689,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.12244897959183673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.48435374149659866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.22976190476190478,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.11518481518481517,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1717171717171717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.47892156862745094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5792494908838608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.38690476190476186,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.027035301548183985,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.4166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6370339174257883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.10416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.20606060606060606,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.21739130434782608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5764705882352942,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.3096938775510204,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.37069160997732437,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.15999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.38571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8928571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7758620689655171,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.6749999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.25625000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.24285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.31052631578947365,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.42857142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.789655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7724137931034479,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.8285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.6222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.20526315789473687,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.36,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.7928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5586206896551723,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.4034482758620689,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.21333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.36451612903225805,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.6266666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.18000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6423076923076925,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.711111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.41578947368421054,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.31428571428571433,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.7724137931034484,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8421052631578947,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.45999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.5071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.46428571428571425,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.3785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.42142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.42857142857142866,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6850000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8400000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7200000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.6800000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.2571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.29285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_8B/summary_results.json b/static/eval_results/Default/InternVL2_8B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d91119c9457eff407742939ac2882586060f469
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_8B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.25956581776451815,
+ "micro_mean_score": 0.2546984460483302
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1165,
+ "macro_mean_score": 0.3978571701460552,
+ "micro_mean_score": 0.4108583690987125
+ },
+ "overall_score": 0.2773656948037259
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.2817247716997634
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.280559214034858
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2511,
+ "tasks": [],
+ "average_score": 0.32020728060179815
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2469,
+ "tasks": [],
+ "average_score": 0.325593535916075
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.24118253695139918
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.39684007367798446
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4700852130325815
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.27052668526005397
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2439,
+ "tasks": [],
+ "average_score": 0.23189345356483618
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.08260405712900723
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.22800928556370195
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.2013779290163996
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.2804429603269583
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.34791358240562653
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.2942163420306113
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3388056726588417
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.10933317885944857
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.250804626773504
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.2522493284864019
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.27414636444623874
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.22381302045502052
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1456,
+ "tasks": [],
+ "average_score": 0.3537549824897016
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.30261189962428353
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.15434618291761149
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.19872104324302098
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.30088711082969344
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.34791358240562653
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17725087609332119
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2532272454839157
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.29129840423784176
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.12166926715781588
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.24700310231619527
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2315,
+ "tasks": [],
+ "average_score": 0.3214666523378005
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3995660275981844
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.24614711281861912
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3393895915929317
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.22078333222564453
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/InternVL2_8B/task_results.json b/static/eval_results/Default/InternVL2_8B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a948f5adb69fc399f963cb90ab13f6eac750b8c4
--- /dev/null
+++ b/static/eval_results/Default/InternVL2_8B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.7270348376779725,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.20595238095238097,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.28279478458049884,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6019887092978411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.035313785427807685,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8821428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.19642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.29362745098039217,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1414141414141414,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.34285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.11666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.08695652173913043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.007711038961038961,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3204993605915246,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.0606060606060606,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.13157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.38205263157894737,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.043576951479015184,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.6865079365079365,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.04444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.19047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.3392857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.08888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.44950911131594296,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0320582378164677,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7272727272727273,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.21666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.46903875551438506,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.32653061224489793,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.39999999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.54,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.41964285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.027174771375503042,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7394957983193277,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.35555555555555546,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.44052631578947365,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5533333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.31442052224278927,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.3928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.5184240362811792,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.5382631578947369,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.3058823529411766,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.11760204081632653,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.15759637188208617,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.22916666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.02222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.018357142857142832,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.36904761904761907,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6153571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5148571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9305714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.33103448275862074,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.40666666666666657,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.4620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.39310344827586213,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.3448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.28421052631578947,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.45333333333333325,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.39999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.431578947368421,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.4034482758620689,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.55,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.21724137931034476,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.403448275862069,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.3333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.4793103448275863,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.7875,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6653846153846155,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.1896551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.17333333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.37142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.613793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.20000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.6032258064516127,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.2357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.40714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.31250000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.5071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.26842105263157895,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.4714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8549999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8684210526315791,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.66,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8421052631578949,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6199999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.6100000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.3857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.4428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.14285714285714288,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.15714285714285717,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.12142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.1142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Llama_3_2_11B/summary_results.json b/static/eval_results/Default/Llama_3_2_11B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9e128e5c619e8d90b92df12a38760d4d8f440b2
--- /dev/null
+++ b/static/eval_results/Default/Llama_3_2_11B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.15999641916771298,
+ "micro_mean_score": 0.15809331016967038
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3173342406187366,
+ "micro_mean_score": 0.3487962166809973
+ },
+ "overall_score": 0.1802478219287358
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.1907604552173455
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.14328677752263275
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.19646404502647707
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.22399113135844315
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.13303760019716085
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.323153603297999
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4260501253132832
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.1770852858056774
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.15366454315378308
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.06563884729522687
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.11886347847341794
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.11489351406848371
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.1693681214060816
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2123769209846321
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.2520175802062012
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2485354956932213
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.06418655520777307
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.12417283740525839
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.16374180545556977
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.1576236804437753
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.15014439824913947
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.3003142292328822
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.19270157739425633
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.1463246409674981
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.0732004839476103
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.1960107191983825
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2123769209846321
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.1351857051327849
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.18586695387250338
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.17288724679416761
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.08100042975820579
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.0575426944971537
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.19899465185565898
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.254316961351997
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.162801811963855
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.28055776664538923
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.13937853323074623
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Llama_3_2_11B/task_results.json b/static/eval_results/Default/Llama_3_2_11B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..54bb871b50f68090ff56c48c2980c262ea44e0e0
--- /dev/null
+++ b/static/eval_results/Default/Llama_3_2_11B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.20517362180506796,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0320582378164677,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.21666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.18367346938775506,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.23051948051948054,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.46,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.13514770116170818,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.009133339778501075,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.3577885451105633,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7012987012987012,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.056101443718362946,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.05999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.056296198118786375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.08888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8025210084033614,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.04578947368421055,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.2301587301587302,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.3504119352087328,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.4666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.015315789473684218,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.039,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.05952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.2767857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.3968253968253968,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.38095238095238093,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.6037857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.10771428571428569,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.09968483232794643,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.1965947368421053,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.11607142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.4047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.04444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.020833333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.06598639455782314,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.21429807404733187,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.13949694490239214,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7452380952380953,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.04695531121001506,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.42823129251700676,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.15357142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.33333333333333337,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.030303030303030304,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.1691176470588235,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.4372276395262732,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.06547619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.028492188570677254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.5778868460875585,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.18571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.16969696969696965,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.13571428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6823529411764707,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.14540816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.09268707482993196,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.154,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.07857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.09285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.17857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.6241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.7625000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.3625,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.32631578947368417,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.25,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.3714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.49310344827586217,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.5896551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.17777777777777776,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.12105263157894736,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.2533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.3357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.3931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.34482758620689646,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.25999999999999995,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4482758620689656,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.3709677419354839,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.18620689655172415,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5066666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.17333333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.10714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.39230769230769236,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.07777777777777778,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.22631578947368425,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.3620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.5894736842105264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.23448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.13999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.44285714285714295,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.47142857142857153,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.2285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.20714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.1857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.8526315789473685,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.675,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.7578947368421054,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8150000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.24285714285714283,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.25,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.19285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.1714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.24285714285714288,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Mammoth_VL/summary_results.json b/static/eval_results/Default/Mammoth_VL/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b2cc0baf5cb8d8e9cfd4184289f35fba2e6c779
--- /dev/null
+++ b/static/eval_results/Default/Mammoth_VL/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.264052880412689,
+ "micro_mean_score": 0.2626894374387823
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.37992668750165337,
+ "micro_mean_score": 0.40120378331900275
+ },
+ "overall_score": 0.27896733083008046
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.30194776127683565
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.2365295791606494
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.2993927028494267
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.3366347826116991
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2408454736444444
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.37895522991264047
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.48003508771929826
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.27232427744946475
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.24522937191710698
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.11457024299726488
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.18941525254390731
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.1718334741390191
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.28108187023954245
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3391119999611432
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.36434285930327387
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.36915384448504296
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.15940750469262005
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.2456942956200745
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.21586513216389874
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.29359048024032264
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2646677074112521
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.34733130661096645
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.3286125236284589
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.16358654572940287
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.25463059203015115
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2919119209789575
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3391119999611432
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.20016011839130254
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2679179451692527
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.23600902063965679
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.15326915093278803
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.20668466311255687
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.33348955971237954
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3759170425350556
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.23894961766260706
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.351703435685048
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.26074348700688493
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Mammoth_VL/task_results.json b/static/eval_results/Default/Mammoth_VL/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6bb4d6e41528d198bb96b60aec60cc1550013fd
--- /dev/null
+++ b/static/eval_results/Default/Mammoth_VL/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.310472219974674,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.030820962942379463,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.36666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.12244897959183672,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.3629715522572665,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.56,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.34586160183944936,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.11160580091854058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5818241712483578,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.2792207792207792,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.3392857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21513761521109695,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5893333333333335,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.7907526968967371,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.33333333333333326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.08888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.06722689075630253,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5094736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.253968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5344949749908332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.5191578947368422,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.3442857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.15178571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.4087301587301588,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.5852857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.35735714285714293,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.17949711661260873,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.4398578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.16071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.1142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.1571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.1285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.15555555555555553,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.12499999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.1064625850340136,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.16417078019580122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0020726462511010965,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.22423936011541246,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.061224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.3815192743764172,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.005952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.42857142857142866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.18181818181818182,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3245098039215687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5792190745087759,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0503968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.029659344301921898,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.11666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.7013765649868268,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.8823529411764706,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0005013842565343441,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.3857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.08695652173913043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6705882352941178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.2636904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.23069727891156463,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.08800000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6285714285714284,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7500000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.4214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.4862068965517241,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.51875,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.09375000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.19285714285714284,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.33157894736842103,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.593103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.40714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6724137931034483,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6965517241379311,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.8111111111111112,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.34736842105263155,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.17142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.3285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.40689655172413797,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.3928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2344827586206896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.22000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.506896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.3838709677419356,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.33103448275862074,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.45999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.26666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5307692307692307,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3777777777777777,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.5421052631578946,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.25000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.4724137931034483,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.3241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.4333333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.32857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.9157894736842105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.615,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.16315789473684214,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7700000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8736842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.675,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.9105263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.07857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.12142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.2571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/MiniCPM_v2.6/summary_results.json b/static/eval_results/Default/MiniCPM_v2.6/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..636b1496744d89284ea5089d88cce3d34abddac2
--- /dev/null
+++ b/static/eval_results/Default/MiniCPM_v2.6/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.22955895202146906,
+ "micro_mean_score": 0.22560399396899078
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.41728623355613875,
+ "micro_mean_score": 0.43452278589853827
+ },
+ "overall_score": 0.2537218694467236
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.2604967101191775
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.2500331562865158
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.3003169369011028
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.31808748114668184
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.18281637763548025
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.40732197204308807
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.48798245614035085
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.23723675736151562
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.1968926733821904
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.08735883237069725
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.21195711598986072
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.18639148159043903
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.21578309681746147
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3527537836840162
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3096882575625531
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3176880312524649
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.0755920550038197
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.23506388020592064
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.1781127776443048
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.2551275278138797
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.20833171754655547
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.36473950920880716
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.293386806641223
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.13955971277399848
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.23596215721092323
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.26319603880798287
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3527537836840162
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17888270664238365
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.22288558250834017
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.2666989364424082
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.11693267119342445
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.15342045420318667
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.29243044121840894
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3777897246686755
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.25714862989687987
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.33187729423141027
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.16493399805627715
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/MiniCPM_v2.6/task_results.json b/static/eval_results/Default/MiniCPM_v2.6/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..aea366727ae846ca60d8703baaa0c037022a920f
--- /dev/null
+++ b/static/eval_results/Default/MiniCPM_v2.6/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6265087790877971,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.1619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.17740929705215416,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5697307134254179,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.025571294374413373,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.15476190476190474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.17401960784313725,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.13131313131313133,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.12570039901308083,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.08070309345387555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.08484848484848483,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.23684210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.039560439560439566,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.3116385321052631,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.25003536329619036,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.2420634920634921,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6509561973369343,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.38095238095238093,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.431547619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.04821164701888182,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.3482142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.17777777777777776,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.2205833035621225,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.034222739980969856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.753246753246753,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.4511428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.4546989887491169,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4178571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.4404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8529071428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.09183673469387754,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.3214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.24,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.41964285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.4603174603174602,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7184873949579832,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.24444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.3010526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21324372091628846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.19999999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.05599999999999996,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.3512772395965795,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.23809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.42993197278911566,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.2630526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.21284013605442173,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.10541383219954649,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.4444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.24999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.5000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.48000000000000004,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5066666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.38947368421052636,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.07,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.4499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.5444444444444445,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.4473684210526316,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.2533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7842105263157896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.12142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.31578947368421056,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.5344827586206897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.396551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6137931034482759,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.4357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.17142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.7187499999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.4799999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.3857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.48387096774193555,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.21379310344827582,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.5172413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.3586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.2533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6344827586206898,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.15714285714285717,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.39285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.43499999999999994,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.64,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.68,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7600000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8947368421052632,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.09285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.21428571428571425,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.37142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.26428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/NVLM/summary_results.json b/static/eval_results/Default/NVLM/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f5960546368b4fa15bc67524b10eebb5393c2ca
--- /dev/null
+++ b/static/eval_results/Default/NVLM/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.21589726765847422,
+ "micro_mean_score": 0.21406043849932396
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3478114310231307,
+ "micro_mean_score": 0.3947549441100602
+ },
+ "overall_score": 0.23287631838857856
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.21591473223174515
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.27426258729618225
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.284874072963892
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.2134087963800149
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2525993645909815
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.4029543142569604
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4317142857142857
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.2442484196551863
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.1424318574406695
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.046798309600525674
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.19655048708297065
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.18621338396242557
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.2922667531642391
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3447361496776569
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.29674507895195534
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.09716389574493003
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.19684666506287793
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.2199792859352912
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.25164831125437204
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2396831363622878
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.3215948035793096
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.1853526865291571
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.3352056263801705
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.038244047619047615
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2100484481849172
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.15704252277801936
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.06688589450465973
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.2292747206409446
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.2689383226748064
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.18857142857142856
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.23682040748983965
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3656649917873737
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.26866914106442213
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/NVLM/task_results.json b/static/eval_results/Default/NVLM/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..38d4ee6da1a106eb733002cbbba4381cf1926f9b
--- /dev/null
+++ b/static/eval_results/Default/NVLM/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.03539274548424487,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.2755102040816326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.68,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.39920016568805217,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.0688028531942305,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.1841206696719848,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7012987012987013,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.13760548426771885,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5240000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.37558289757353336,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.35555555555555546,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9075630252100841,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.43610526315789483,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7460317460317459,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.4075492509529021,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.6222222222222221,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.541578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6185714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.8571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8056142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.343,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.013955337071466666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.3731789473684211,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.12857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.1820853399616799,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.009223028391365047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.1836734693877551,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.2104138975306737,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.061224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.21212121212121213,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.29705882352941176,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.3997587534660852,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.13690476190476192,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.007137000066986845,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.17543859649122806,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.21428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.08695652173913043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.27142857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6705882352941177,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.1364795918367347,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1446995464852608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.21600000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.24285714285714288,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6214285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7206896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.38125,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6586206896551723,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.529032258064516,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.6620689655172413,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.6000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.6599999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7307692307692307,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.4444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.6517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.805263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4689655172413794,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.13571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.36428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.19285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.72,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.535,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6699999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.3499999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Phi-3.5-vision/summary_results.json b/static/eval_results/Default/Phi-3.5-vision/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1e9d5d25bb8ca28106310878ffdeebc6788d2f0c
--- /dev/null
+++ b/static/eval_results/Default/Phi-3.5-vision/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.22995297916629392,
+ "micro_mean_score": 0.22708502951025372
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3947914647737769,
+ "micro_mean_score": 0.42459157351676696
+ },
+ "overall_score": 0.2511698139474551
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.2550326045763433
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.24395249720074527
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.2858236369733704
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.29876274710122536
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.21972896566746963
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.37513466171380355
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4713934837092732
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.25475240046465697
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.20386233377001492
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.06657701969095552
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.16556787388989183
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.17989790940001513
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.2671646581690049
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.24920333780186898
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3057560384411286
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3341992361416253
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.12884156381685322
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.20494682188374266
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.21180084406324556
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.2609992615064841
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2149689274645855
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.365192668303297
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.2593652357274648
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.10107709750566891
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.11861055655587921
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2824151476986241
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.24920333780186898
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.1980440594073205
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2636292373854696
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.20747122167273002
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.08602953103518936
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.20136893467064246
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.30979039348232706
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3495072422622861
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.25858403958844717
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3357218088688187
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.21140555087788399
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Phi-3.5-vision/task_results.json b/static/eval_results/Default/Phi-3.5-vision/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..eba9431f34beb2807358932d6431ddcd32db923e
--- /dev/null
+++ b/static/eval_results/Default/Phi-3.5-vision/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6556210357219137,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.1738095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.2679705215419501,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.11483433671209539,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.23809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.029751219517657808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.0326530612244898,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.5773809523809524,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.18452380952380953,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.17401960784313728,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.12121212121212122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.03333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.12530331014008708,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.05222405626377131,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.12833594976452117,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0503968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.5203105263157894,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.4903669849303062,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.6468253968253969,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.7891263675852077,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.03700000000000008,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.19047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.2642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.032984275722218924,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.11607142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.3777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3545876186407767,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.034222739980969856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.5194805194805193,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7397142857142861,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.39135519540824326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5469999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.16666666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9458071428571426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.2653061224489795,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.1285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.020833333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.52,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.19642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.018377650164657217,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.4523809523809524,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.6092436974789915,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.40625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.24444444444444446,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6657894736842106,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21324372091628846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.30357142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.28000000000000014,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.34326122251920943,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.44642857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.27456709956709957,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.34536842105263166,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.4285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.09821428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.16808390022675737,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.10416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.11904761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.08571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6724137931034484,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.510344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.3888888888888889,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.24000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.4857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.125,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.43333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3555555555555555,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.18666666666666665,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.5655172413793104,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7241379310344829,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.6562499999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.4571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.45862068965517233,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.1857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.24666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.25789473684210523,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.3931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.47857142857142865,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.4642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.6214285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6076923076923078,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.052000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.40967741935483876,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.26666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6310344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7526315789473683,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.49285714285714277,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.14999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.05714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.5785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.8578947368421055,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8894736842105264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.655,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.915,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7900000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Pixtral_12B/summary_results.json b/static/eval_results/Default/Pixtral_12B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7b2c538d50bf2b1e42d3ba272fa87d54e676a20
--- /dev/null
+++ b/static/eval_results/Default/Pixtral_12B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.31362045151669854,
+ "micro_mean_score": 0.3100986209078182
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.4566234428542061,
+ "micro_mean_score": 0.4870593293207223
+ },
+ "overall_score": 0.33202677713439754
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.34184129499032456
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.37667712211439836
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.37896441862738645
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.37077191302051077
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2843861774995234
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.4098150360139686
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.533077694235589
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.3372902862054838
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.25372282838901716
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.09524894246403817
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.2972619996610934
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.28304049684103855
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.33523333364720703
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3988260865341648
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.39117521970978353
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.35583482417594536
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.21897822147396953
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.3436473210057542
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.28979044279399635
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.33530850344530555
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.30160980000905374
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.4166613092238044
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.30796171250186904
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.22871315192743763
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.21669652626580332
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.36087312117067055
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3988260865341648
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.24616927284658197
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2900329121369093
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.42652313209316933
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.1209559708312353
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.25678368121442124
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.37605128363484847
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.4576088857728113
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.3464929909487855
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3858431845580602
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.2549787156825223
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Pixtral_12B/task_results.json b/static/eval_results/Default/Pixtral_12B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..39841de7705add7cd173cac13a05a4acf825453e
--- /dev/null
+++ b/static/eval_results/Default/Pixtral_12B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.501198167796616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.021818162950542508,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.8888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.6271825396825397,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.72,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.24488748504919397,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.0453150580624047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.6312894180411065,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.23775530155879165,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6880000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6488432934203903,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.04444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.6176470588235295,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6796315789473685,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5612662313600751,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.6222222222222221,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.497842105263158,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7906428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4404761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.48571428571428577,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.3392857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9798999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5264285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.15948320227617638,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.48810526315789476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.33511904761904754,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6964285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.48809523809523814,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.2888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.12857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.17142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.1571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.17777777777777776,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.10416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.13095238095238096,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.23809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.21684615697123963,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.17302345614703815,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.038726333907056806,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8869047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.12244897959183673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3963641386180832,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.5994897959183675,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.16972789115646256,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.10810255920550038,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.19047619047619044,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.23232323232323232,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.31813725490196076,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5707553325488358,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3095238095238096,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.011188369871131833,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6077729535514645,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.34210526315789475,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.22916666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.1857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.14545454545454545,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.021428571428571432,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.30025510204081635,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.22222222222222224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.5882352941176471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.19799999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.45714285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7275862068965517,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8312500000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.4375,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.19999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.35263157894736835,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7137931034482758,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7206896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.6888888888888888,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5928571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.45263157894736833,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.48666666666666664,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.7928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5896551724137932,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.5785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.27586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.24666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6172413793103447,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4387096774193549,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7448275862068967,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.18571428571428575,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.4666666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.28666666666666674,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7384615384615386,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.43333333333333346,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.3631578947368421,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.37142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.5689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.48965517241379314,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.17142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.08571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7700000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.725,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6799999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8947368421052632,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.53,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.29285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.3428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Qwen2_VL_2B/summary_results.json b/static/eval_results/Default/Qwen2_VL_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..76a71eabec4ee5d88551bf968f232ee13dffdc5a
--- /dev/null
+++ b/static/eval_results/Default/Qwen2_VL_2B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.20877163406364055,
+ "micro_mean_score": 0.20561526268932287
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3154302566225611,
+ "micro_mean_score": 0.33856405846947557
+ },
+ "overall_score": 0.22249997162072932
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.22236161923122505
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.23701014663017753
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.25669221785292334
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.26526414975225454
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.17623548305581763
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.31250702198481506
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4140676691729323
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.20802820480076603
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.17320633068307653
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.06209506566980099
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.190837839372028
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.16287824421269087
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.19640906475019812
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2520741776922928
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.24883076673424442
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2877316297453947
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.13398525561847363
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.1624451002757208
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.20960092816529263
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.19986806708136184
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2201024015934558
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.30248748033122763
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.256631742010999
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.07681405895691609
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.10526691703628158
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.25018977062352593
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2520741776922928
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17435940889565366
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.21286783416184518
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.2521972668785968
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.06967138760493456
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.16996250112948405
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.27603334911345223
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.31002436092347696
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.21061929716065056
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.2656728023444808
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.16356158787929762
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Qwen2_VL_2B/task_results.json b/static/eval_results/Default/Qwen2_VL_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1731cf4310404cc32e815ba9a38308438fd0969f
--- /dev/null
+++ b/static/eval_results/Default/Qwen2_VL_2B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6794157898981115,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.20119047619047617,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.4712868480725624,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5699819523580396,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.03355324641748354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09455782312925169,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.6476190476190474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.10101010101010101,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.15426188418265965,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.068472026925481,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.06265503408592417,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.10303030303030303,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.33333333333333337,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.36160000000000003,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5028221142675466,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.734126984126984,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.3707918542570349,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.03214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.15555555555555553,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.1691847451654077,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.034222739980969856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.28414285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.31341468102097647,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4668571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.7238999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6696428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04162492664914846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.6031746031746031,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9201680672268909,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.35555555555555557,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_identity_matching",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5305263157894736,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.1433110081487998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.24444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.5178571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.2286666666666668,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.4155970724620079,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.3239260739260739,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.513578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.42857142857142866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.488095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6588235294117649,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.20493197278911562,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.07681405895691609,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.10416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.3482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.17142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.2689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.21379310344827587,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.15555555555555556,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.24666666666666665,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.46428571428571436,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.1928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.5689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.4789473684210526,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.1125,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.4799999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.1111111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.22666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.28620689655172404,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.42758620689655163,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.4206896551724139,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.7187500000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.31379310344827593,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.557142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.3533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.47857142857142854,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.3482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.3714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.55,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.07,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4548387096774193,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.3733333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.3206896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7631578947368421,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.39473684210526305,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.15000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.11428571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.09285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.10714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.36428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.02142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.17857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.12105263157894743,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6900000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.62,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.30999999999999994,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8736842105263161,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.625,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Qwen2_VL_72B/summary_results.json b/static/eval_results/Default/Qwen2_VL_72B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..21ca30c7af4a3ac146431c7c2c9ef8774c9fdd1e
--- /dev/null
+++ b/static/eval_results/Default/Qwen2_VL_72B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.4542376574527161,
+ "micro_mean_score": 0.4501201906164793
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5639771804231668,
+ "micro_mean_score": 0.5835339638865004
+ },
+ "overall_score": 0.4683625465479226
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.48669152179713876
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.5291932917937967
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.53654503409075
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.4931554892760308
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.3908023665629473
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5668846347262286
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6121127819548872
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.4493794346300551
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.33622171962424363
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.21642754068858566
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.5263730250833892
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.42759570727857965
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.4228561177227288
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4780253686541936
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5070774860945021
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.4807292191169126
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.38847545874852984
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.4359156358804688
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.43781407268698613
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.49080138099759946
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.42481004254128113
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5132810622684265
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5062248706593999
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.3063303099017385
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.523959576707116
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4879791577413812
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4780253686541936
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.34846161336322395
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.44101149919132854
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5663587858366833
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.3067825586087303
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.4121566368482877
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5176521211872086
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5030444649397028
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.45616267568458396
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5047683071464567
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.3553838743540432
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Qwen2_VL_72B/task_results.json b/static/eval_results/Default/Qwen2_VL_72B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a17c72937237724cf8c14214bfd7fc050e640b8
--- /dev/null
+++ b/static/eval_results/Default/Qwen2_VL_72B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.735856751092166,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.08008270592840953,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.38333333333333336,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.05299999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6938775510204082,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.6987244897959183,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.78,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.2708333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.46765174197558695,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.16496674091169844,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.7827708359722172,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9480519480519481,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.34596078503152894,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.8039999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6697279131770494,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.11111111111111112,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7784210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.9047619047619049,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.7404077001845016,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.7777777777777779,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.7639999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7441428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.47619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.65,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.8839285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9603174603174605,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9812571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.30314505117346885,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.7969263157894737,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.318452380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.7410714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.8333333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.7232142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.12857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.1142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.2708333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4411444596352561,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.32276496873129895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.08595213084035436,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.22448979591836735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.49168715680336533,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.1836734693877551,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.47891156462585033,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.17261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.3062298603651987,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.25476190476190474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.35353535353535354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.44313725490196076,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.29485155847974215,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.519047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.5476190476190476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.022873657642764916,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.6166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6989250130797713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.7058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.002625410220346934,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.4166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.17391304347826086,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.9047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.5882352941176471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.44285714285714295,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.8452380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.4263605442176871,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.37137188208616784,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.43333333333333335,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.7058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.38095238095238093,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.2738095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.7758620689655172,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.4724137931034481,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.6444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.4928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.5894736842105264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.21875,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.5533333333333332,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7555555555555555,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.31333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7620689655172413,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7172413793103449,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7793103448275861,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.8,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6413793103448274,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5466666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.32105263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6206896551724139,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.7857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.7428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7653846153846156,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.7499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.338,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.6096774193548388,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.4866666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6724137931034484,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.4789473684210525,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.75,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.45714285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.37142857142857133,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.5142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.46428571428571425,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.5928571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6285714285714284,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.23571428571428577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.39285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.3714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.32142857142857134,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.4428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.5950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6049999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.56,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.69,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/Qwen2_VL_7B/summary_results.json b/static/eval_results/Default/Qwen2_VL_7B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a67230b05e5f3234888e722eab28419c004ee575
--- /dev/null
+++ b/static/eval_results/Default/Qwen2_VL_7B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.3293449599230247,
+ "micro_mean_score": 0.325331493515679
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1170,
+ "macro_mean_score": 0.43955105763038577,
+ "micro_mean_score": 0.45508547008546996
+ },
+ "overall_score": 0.34352990319228904
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.3506773570484231
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.38363163370919123
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2511,
+ "tasks": [],
+ "average_score": 0.3882785389756705
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2469,
+ "tasks": [],
+ "average_score": 0.38292659892379843
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2730765188348748
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.4625711182912848
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.5287318295739348
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.32297080808954215
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2439,
+ "tasks": [],
+ "average_score": 0.2561357336105554
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.12651411144309255
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.35229497847636093
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.2881996369284258
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.3162917354476226
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.3555910609857979
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3513518594470202
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.39509504888372243
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.19173322639974366
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.3118818521697947
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.3323478338046426
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.31975345327634014
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.3207400992620562
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1456,
+ "tasks": [],
+ "average_score": 0.39680785337230745
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.38069986029874947
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.21448412698412703
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.34991843422677277
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.36487656334089386
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.3555910609857979
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.23950364354876252
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.31886513111201115
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.3972495309304478
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.18098305857595157
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.30887234822244314
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2315,
+ "tasks": [],
+ "average_score": 0.39256038521661607
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.44924313486983725
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.2880278656037017
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.4015531477048036
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.24179792538224956
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/Qwen2_VL_7B/task_results.json b/static/eval_results/Default/Qwen2_VL_7B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..48d910a2b4b14cfe26a2d195778a1e9ea647c0ac
--- /dev/null
+++ b/static/eval_results/Default/Qwen2_VL_7B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.3953838788800739,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.026547987444069228,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.5370954442383014,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.64,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.37716272271985773,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.12041002821230626,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.7728401943343881,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8766233766233765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.19897365891504984,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6093333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.7543233207204482,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.6444444444444443,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.12222222222222223,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.018518518518518517,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7726315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.746031746031746,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.6253043759566965,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.5065263157894736,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6086428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.78125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.7410714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9722222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.47619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9400428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.45385714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.6399368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.3928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.7232142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.5982142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.08888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.06598639455782314,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.35939538929969556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.2838251878691227,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.032670025538873985,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8869047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.2756576749919229,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.36434240362811793,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.17261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.13337585034013605,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6904761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.1845238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.25252525252525254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.39166666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.11383541415414918,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.0503968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.2261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3095238095238096,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.03296776632380673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0066666666666666775,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.35,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.7142674593015049,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.000811738675187593,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.3714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.08484848484848485,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.08695652173913043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.07857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6352941176470588,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.2787414965986395,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.2790532879818594,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.4310344827586208,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.37999999999999995,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.5620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7947368421052632,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5793103448275861,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.506896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.4368421052631579,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.2666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.43571428571428567,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.445,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.3724137931034482,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.33333333333333326,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.19285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.7250000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6500000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.3068965517241378,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.12999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.3333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.29285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.28,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.15333333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4548387096774194,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.3714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.31249999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.5333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6357142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.23750000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.2899999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.4428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7449999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.865,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7449999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.8850000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6399999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.255,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.41428571428571426,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.4071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.1357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.22857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.2214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.24285714285714288,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.2071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.13571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/all_model_keywords_stats.json b/static/eval_results/Default/all_model_keywords_stats.json
deleted file mode 100644
index 0fd965fa68464b34b7227d2a08b3cd074a9bb74f..0000000000000000000000000000000000000000
--- a/static/eval_results/Default/all_model_keywords_stats.json
+++ /dev/null
@@ -1,5384 +0,0 @@
-{
- "GPT_4o": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.5630758211022604
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.6216411634729735
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.616018277142757
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.5823101249498799
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.44177544539510955
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.6345458069232931
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6795263157894738
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.5514924675940659
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.39435038953269674
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.22934807257231926
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.608083455060831
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.491325251564869
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.4999089647103332
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.5315979872161023
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.5641404607063637
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.5613545677222056
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.47760591698367955
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.5388690453811203
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.48037685656449847
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.5994159671881645
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.44606605087301393
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.6274371950293718
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.5448877153826162
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.4751133786848073
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.5343350103400748
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5672657028463585
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.5315979872161023
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.4500928191484624
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.4908653289106883
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.7056027785545881
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.33202130899313653
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.5032849161169843
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.5510350848991218
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.6095778863474799
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.5283797185155754
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.6135723164021851
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.44047720383044436
- }
- }
- },
- "Gemini_1.5_pro_002": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.5202055934299538
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.5017043129027509
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.5532599716027446
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.546753787203128
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.425969084163906
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5751012914154264
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6982330827067671
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.513647745999633
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.3845337030093212
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.23899503258223884
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.4625032188638111
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.4292353723689881
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.4869625906903554
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.5028718355967439
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.5584779204331461
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.55005349042813
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.4292127751495457
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.44896309957892694
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.44418591808616864
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.5146447350354234
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.4688623462674191
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.5580414823700747
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.5538255562099124
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.39066515495086923
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.5370278962809547
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5034399620483027
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.5028718355967439
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.4885398161821004
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.45544217378728585
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.5421439953094952
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.3335324339429373
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.43465181771633377
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.5250631828331306
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.5821004797173627
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.5124355410095621
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.5722329455291694
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.41210885517904977
- }
- }
- },
- "Gemini_1.5_flash_002": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.46250942866818673
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.4337278553354258
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.49947464681475356
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.5098686082319499
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.34393279682972117
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5594391803821158
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6380250626566416
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.44816564352475535
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.34510790215980036
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.18973764406890803
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.3865262916591035
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.3598139859097534
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.4013870708864889
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.4903530871753026
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.5051202896842343
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.5166044655846657
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.3849084036535956
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.3869438864407766
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.39868324168390534
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.44793686445264996
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.3704146726364947
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.5448638967636353
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.47829883834573317
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.33669690098261523
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.43653808057103954
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.4427944359714585
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.4903530871753026
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.42346517633403413
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.41994719346489817
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.46645473820179373
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.2517485212411566
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.40372378342017806
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.4799408254775632
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.6010361821632402
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.4569546533897065
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.511590428993871
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.33710867194177685
- }
- }
- },
- "Claude_3.5": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.5405089647404562
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.6082834220752651
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.5745077617490254
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.5450038475783499
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.4767692987630454
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5756126284078804
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6969774436090224
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.5278843049497918
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.4082144793870471
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.23803578664609892
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.5691641481808987
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.4795267886975966
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.525848282456283
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.508735695828719
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.5699094130430454
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.5096772701625744
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.4429640420975014
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.5066797418318023
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.4971460788134188
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.5278127103234661
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.4490020843308984
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.5838224169821388
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.5456152399978661
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.46300075585789874
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.5414381873407914
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5373019912310933
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.508735695828719
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.4422556748863689
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.49311554035078103
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.6663170946790707
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.3382015835012861
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.5194010220575684
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.532329797132399
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.5808831682303479
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.513474611293123
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.5507075880782885
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.47461998432626556
- }
- }
- },
- "Claude_3.5_new": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.5690045172520449
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.6220681231036606
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.6077980666415158
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.5511440615639541
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.4885536652013625
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5908204006544897
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6569473684210526
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.5486763511384175
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.4315385951907387
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.2909419331017877
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.6048192628845258
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.48924295292319175
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.556418710368288
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.4946691340754988
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.5558756390298104
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.5425198547046186
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.44210335381541843
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.5187252051932875
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.5071121107460066
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.5387340524651681
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.4824302644151348
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.6242798397166945
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.5782691045270721
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.4630277507828528
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.5914338446093256
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5636254729390459
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.4946691340754988
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.4828123870640382
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.48756636014597515
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.6590137441693218
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.39901670035164916
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.5166853031535193
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.5561634744977417
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.6123769274172342
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.5512015158810595
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.565796566886933
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.4763267502912362
- }
- }
- },
- "GPT_4o_mini": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.4492982787524939
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.49026056071002017
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.5168957112681365
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.46731791428406805
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.3406008235342885
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5572925295284307
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6902380952380953
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.4189154010048976
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.2943206715105082
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.19422793560945503
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.47202628409684394
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.3624496929166193
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.38946844562183286
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.45508480503584553
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.47569921440672464
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.465175334092545
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.29410984789062117
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.41242028190533997
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.3906415365938764
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.44244772638735347
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.3629944944697668
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.5713834131825314
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.39874839531459466
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.3359977324263039
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.4305788513381019
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.46343334374251277
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.45508480503584553
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.24651576711552803
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.36981497185070983
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.5666618234843734
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.2420320329702607
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.3458483931206892
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.43590838051817093
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.5176671720617656
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.3554299482098288
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.5399167524341886
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.32918280841495845
- }
- }
- },
- "Qwen2_VL_72B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.49787264809826687
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.5439010430283516
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.5392244859385411
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.509277882172206
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.3776739609562984
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5676817981386025
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.60496992481203
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.4633019068994453
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.35105970797600183
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.2201150812944581
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.5402397677488632
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.4289777675393297
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.42094543671351287
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.49943888306036405
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.507967430369507
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.49789939867591104
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.36212605501536715
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.44719815365440824
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.4500902736468407
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.5098505660529429
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.4027115384266939
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.5157810622684265
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.5199940976484408
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.3100812547241119
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.5468722850464449
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.4918205178721877
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.49943888306036405
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.36691704884033916
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.45176098055218655
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.5807658773593334
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.31245958897213383
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.4372517645050852
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.5362106489630868
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.4968249101570037
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.4488852456563113
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.5166939389651373
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.31157492395100744
- }
- }
- },
- "Qwen2_VL_7B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.3708368629321668
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.40213773918065815
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2511,
- "tasks": [],
- "average_score": 0.4034335110538307
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2469,
- "tasks": [],
- "average_score": 0.4109909230944937
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.2818925976996871
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.49360878418945336
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.5215889724310777
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.33309401517140946
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2439,
- "tasks": [],
- "average_score": 0.27564756843599875
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.1473690605854188
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.3821046882337143
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.2896392967775049
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.3223325179806271
- },
- "Videos": {
- "count": 43,
- "num_samples": 700,
- "tasks": [],
- "average_score": 0.4111189310485516
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.34825121621909577
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.40660144920567376
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.262166593895899
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.3430730210869785
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.3426196933687219
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.35162604166912687
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.32665673520415817
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1456,
- "tasks": [],
- "average_score": 0.3909745200389741
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.39898011714302023
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.19415154950869234
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.37453319457428763
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.37701588079136955
- },
- "video": {
- "count": 43,
- "num_samples": 700,
- "tasks": [],
- "average_score": 0.4111189310485516
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.26429868057315387
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.33008667136891007
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.42746758545520747
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.2003871750665659
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.3270187644950453
- },
- "Perception": {
- "count": 145,
- "num_samples": 2315,
- "tasks": [],
- "average_score": 0.40048749993497734
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.4245693009859056
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.29880557491654197
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.4276637093173368
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.25562039051316643
- }
- }
- },
- "llava_onevision_72B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.3615741356043519
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.2834675874668524
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.3674817002808495
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.42146038539739283
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.2951434804409883
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.478119286755779
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.6005438596491229
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.31663222188988865
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.29633645022129285
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.13872280436872364
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.23380046931752074
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.2126914943750874
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.34566020099204997
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.4446001874842145
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.4401364830377099
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.4247591719013819
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.23897262553543516
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.2868275930712835
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.259450238500612
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.370724080249463
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.3065719940769206
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.4293132525502993
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.3986052416087927
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.20730347694633405
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.28104747671521785
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.34840850032295206
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.4446001874842145
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.25013213032747944
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.34156793747875674
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.3076421844825067
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.18168666652660437
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.23240790940031927
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.38362780453378204
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.4807891958712894
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.31702495228966576
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.4358874880224115
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.31588468105075895
- }
- }
- },
- "llava_onevision_7B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.2524786809911341
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.19077168655703208
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.2555444562659206
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.29981286990552625
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.18973491465938852
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.36842322314565323
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.44998746867167916
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.2445135206648208
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.21802943568344288
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.06658775725427067
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.1466861610319767
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.13297395577964055
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.24236719143449742
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.30985943541023103
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.3199731020402028
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.3263378734842879
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.13043163858789789
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.20277804188944173
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.18291595756285564
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.25384794412815426
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.2200472229099345
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.3127341248874411
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.2802999516721972
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.1476473922902494
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.13803800801858385
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.2548084764084038
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.30985943541023103
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.1778991941079372
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.2410111891690358
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.19283211154717242
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.09846926279075068
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.15189414475467605
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.28505205882578405
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.3600079950628582
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.23654776813656775
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.3271805711561501
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.22080546908673507
- }
- }
- },
- "InternVL2_76B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.38193012983650343
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.41315219763443384
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.43665980552577693
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.4265623936500962
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.2975890791763991
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.5257990949897898
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.5779473684210527
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.33287081421166276
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.2949505390920417
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.17036496432397477
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.3634339625985008
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.31396468806559114
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.3473756113126343
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.395893002855977
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.44982107744035305
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.42875248733027654
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.2868239162778749
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.3630499545707523
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.3476691827105281
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.3943337471922549
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.29244088978470345
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.45822072478616577
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.3879326330400817
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.20309901738473166
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.34771123515123364
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.4145693044465943
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.395893002855977
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.24403942809507134
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.3153417935059416
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.4306947454508794
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.2132321995754061
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.2953329718984368
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.42202934355552685
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.47409276729986083
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.30014798153766264
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.4625649385962016
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.2868813944130515
- }
- }
- },
- "InternVL2_8B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.2817247716997634
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.280559214034858
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2511,
- "tasks": [],
- "average_score": 0.32020728060179815
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2469,
- "tasks": [],
- "average_score": 0.325593535916075
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.24118253695139918
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.39684007367798446
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.4700852130325815
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.27052668526005397
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2439,
- "tasks": [],
- "average_score": 0.23189345356483618
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.08260405712900723
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.22800928556370195
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.2013779290163996
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.2804429603269583
- },
- "Videos": {
- "count": 43,
- "num_samples": 700,
- "tasks": [],
- "average_score": 0.34791358240562653
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.2942163420306113
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.3388056726588417
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.10933317885944857
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.250804626773504
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.2522493284864019
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.27414636444623874
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.22381302045502052
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1456,
- "tasks": [],
- "average_score": 0.3537549824897016
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.30261189962428353
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.15434618291761149
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.19872104324302098
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.30088711082969344
- },
- "video": {
- "count": 43,
- "num_samples": 700,
- "tasks": [],
- "average_score": 0.34791358240562653
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.17725087609332119
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.2532272454839157
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.29129840423784176
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.12166926715781588
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.24700310231619527
- },
- "Perception": {
- "count": 145,
- "num_samples": 2315,
- "tasks": [],
- "average_score": 0.3214666523378005
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.3995660275981844
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.24614711281861912
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.3393895915929317
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.22078333222564453
- }
- }
- },
- "MiniCPM_v2.6": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.2604967101191775
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.2500331562865158
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.3003169369011028
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.31808748114668184
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.18281637763548025
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.40732197204308807
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.48798245614035085
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.23723675736151562
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.1968926733821904
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.08735883237069725
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.21195711598986072
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.18639148159043903
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.21578309681746147
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.3527537836840162
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.3096882575625531
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.3176880312524649
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.0755920550038197
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.23506388020592064
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.1781127776443048
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.2551275278138797
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.20833171754655547
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.36473950920880716
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.293386806641223
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.13955971277399848
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.23596215721092323
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.26319603880798287
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.3527537836840162
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.17888270664238365
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.22288558250834017
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.2666989364424082
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.11693267119342445
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.15342045420318667
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.29243044121840894
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.3777897246686755
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.25714862989687987
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.33187729423141027
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.16493399805627715
- }
- }
- },
- "Phi-3.5-vision": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.2551037902226636
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.2483252111012436
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.28732942108098564
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.3049602749093698
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.21653804346780042
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.36823084724842464
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.46663157894736845
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.24145330077248778
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.2154692063816354
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.08944481289041872
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.1865974025588298
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.17497379027990792
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.26053460127801603
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.24669318645450836
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.2786226802221388
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.3413768635559215
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.15444746077692828
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.2177924712685756
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.21443984349574025
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.2572371188897671
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.21409351002477045
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.365192668303297
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.25960269434727634
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.12546296296296297
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.14337869666229008
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.27790147494714373
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.24669318645450836
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.20168001345379397
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.2850550871176333
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.2237087834389946
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.08928724806836039
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.219367263034246
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.316318567258608
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.3945898792928062
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.21925278489551242
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.33264696401038385
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.17575913004138646
- }
- }
- },
- "Pixtral_12B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.3460288961410444
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.3777640755922415
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.38299418297106824
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.3776722463473817
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.2828575553466608
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.419071767659191
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.5687919799498747
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.32813540763467464
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.2677293131171651
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.10591240329992047
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.3070067338940785
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.28832738144368647
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.3223299098375932
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.409643099998057
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.37450808136321684
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.37115973962368864
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.24009431093278263
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.3078181788009137
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.3188475653127356
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.3639544140938305
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.32073418701669026
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.4166613092238043
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.3008126415966517
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.19743008314436883
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.16642294307267227
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.37108130557306335
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.409643099998057
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.2575699315401612
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.3104621543981899
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.4300741596942578
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.13622980866275425
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.2572414987500377
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.3892097218585385
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.5020540387409291
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.31301986568151985
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.3809515410188075
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.24222628640267738
- }
- }
- },
- "Llama_3_2_11B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.1907604552173455
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.14328677752263275
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.19646404502647707
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.22399113135844315
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.13303760019716085
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.323153603297999
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.4260501253132832
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.1770852858056774
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.15366454315378308
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.06563884729522687
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.11886347847341794
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.11489351406848371
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.1693681214060816
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.2123769209846321
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.2520175802062012
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.2485354956932213
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.06418655520777307
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.12417283740525839
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.16374180545556977
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.1576236804437753
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.15014439824913947
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.3003142292328822
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.19270157739425633
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.1463246409674981
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.0732004839476103
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.1960107191983825
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.2123769209846321
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.1351857051327849
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.18586695387250338
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.17288724679416761
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.08100042975820579
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.0575426944971537
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.19899465185565898
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.254316961351997
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.162801811963855
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.28055776664538923
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.13937853323074623
- }
- }
- },
- "Idefics3": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.14507788965553362
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.11641535161320743
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.17255583910766542
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.14745217246476708
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.1331851390883708
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.19221534222332276
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.28640852130325817
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.17906399043310475
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.10192930055370109
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.04211916597550756
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.10126271262360581
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.11407926733108291
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.16225217317782772
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.16181866973635636
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.1839408679813373
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.14933801491626408
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.0395540896656236
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.13979628998424784
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.1062779093260333
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.07053056796593082
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.09790172378722654
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.2987797010800956
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.11588163814170001
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.1008692365835223
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.09308121224497533
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.14757589734485796
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.16181866973635636
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.12217834249866026
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.12276246278377517
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.14743542163139847
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.05354869594691955
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.09065540194572455
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.1463280929280822
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.14564374862578883
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.22748773785486257
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.17647756032677067
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.13168972973651977
- }
- }
- },
- "Aria": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.3264829094772722
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.35712138797286674
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.4004806395853317
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.3783082688258977
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.27628131703993153
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.4942870225393938
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.5811228070175439
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.3279996334048362
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.2481896092177717
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.11945216302285933
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.2830308005758272
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.27833423130489043
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.32371820359400666
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.42875359425696014
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.3612041984219992
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.37290568595471846
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.19554976321164697
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.3092653492193887
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.3043751656077328
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.2930015244066511
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.3092167834876797
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.4523860109667709
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.3277812604542708
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.21139455782312927
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.2711617723374526
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.3576735443060994
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.42875359425696014
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.19839956701033565
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.27267126872569447
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.38321397541649777
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.14301905320436192
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.2849545194421855
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.3779947327886569
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.39678729061309725
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.29682445889316517
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.4096377585306089
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.26194160419181234
- }
- }
- },
- "NVLM": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.24033557047857043
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.32154059695494047
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.2937052996171993
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.22845955700594492
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.2639741933075709
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.40870864071047447
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.4555238095238095
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.25785191641267197
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.15679681195908274
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.0672259242345112
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.23922823287047076
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.21734036617042948
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.30313485498585124
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.0
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.34726189956094355
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.3264757655296162
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.056894830390305184
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.22868389095927066
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.2788963949121424
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.2787764976961992
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.23349712171444964
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.3215948035793096
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.18487055428231897
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.0
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.0
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.3680809151131777
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.0
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.03838410364145658
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.2325581694709435
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.22773778915303383
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.08048160660797504
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.2390024647851972
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.30211261814126533
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.18857142857142856
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.24908307640275493
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.3724877947012685
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.24529601154794037
- }
- }
- },
- "InternVL2_2B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.14491178903291552
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.12126906675624163
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.16912754929321935
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.18542274192083463
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.13923308734553164
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.23992252224543772
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.3420927318295739
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.14807577209152425
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.13036555933925006
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.01727799227799228
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.057021136657850864
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.10504085961245285
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.1625198552182714
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.18999779001767986
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.1487677475708977
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.2011727338536935
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.11886936592818943
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.1131404778887607
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.05739750616837997
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.15465451663650032
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.16044698450090833
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.21429521387724249
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.2128614316540013
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.03658352229780801
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.05757839721254354
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.15225683687839608
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.18999779001767986
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.17677460549936644
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.158165588340436
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.08722661966805
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.04102853815875594
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.11264043251709285
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.17001758160301803
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.3332891958712894
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.1686125516807394
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.21169137106199268
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.10975764217070672
- }
- }
- },
- "Qwen2_VL_2B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.22236161923122505
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.23701014663017753
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.25669221785292334
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.26526414975225454
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.17623548305581763
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.31250702198481506
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.4140676691729323
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.20802820480076603
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.17320633068307653
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.06209506566980099
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.190837839372028
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.16287824421269087
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.19640906475019812
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.2520741776922928
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.24883076673424442
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.2877316297453947
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.13398525561847363
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.1624451002757208
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.20960092816529263
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.19986806708136184
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.2201024015934558
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.30248748033122763
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.256631742010999
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.07681405895691609
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.10526691703628158
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.25018977062352593
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.2520741776922928
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.17435940889565366
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.21286783416184518
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.2521972668785968
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.06967138760493456
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.16996250112948405
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.27603334911345223
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.31002436092347696
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.21061929716065056
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.2656728023444808
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.16356158787929762
- }
- }
- },
- "Aquila_VL_2B": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.18420666660337692
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.12395530240359122
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.17924536722051596
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.220108610660707
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.1680749869910155
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.26630477322766793
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.35152130325814535
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.1857154485444521
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.1616397700608881
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.044513236949565
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.07480350331940272
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.11444110320621242
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.19412275574929044
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.21367350061199514
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.19717811128156643
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.24620947964695974
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.10131259529340846
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.11925340914357861
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.123417109500157
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.18474924824567768
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.19908864029107046
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.23278612647548963
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.22108484223035305
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.11057256235827662
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.011631871744697361
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.18240049845355885
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.21367350061199514
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.1898373110613516
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.23274180707905315
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.09484068019620011
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.05864269260897992
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.13323092677931386
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.20714098741611
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.2932627505936196
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.21075421274487907
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.24110595572817994
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.20711160718581811
- }
- }
- },
- "Mammoth_VL": {
- "skills": {
- "Object Recognition and Classification": {
- "count": 303,
- "num_samples": 4755,
- "tasks": [],
- "average_score": 0.30194776127683565
- },
- "Text Recognition (OCR)": {
- "count": 137,
- "num_samples": 2239,
- "tasks": [],
- "average_score": 0.2365295791606494
- },
- "Language Understanding and Generation": {
- "count": 154,
- "num_samples": 2509,
- "tasks": [],
- "average_score": 0.2993927028494267
- },
- "Scene and Event Understanding": {
- "count": 154,
- "num_samples": 2467,
- "tasks": [],
- "average_score": 0.3366347826116991
- },
- "Mathematical and Logical Reasoning": {
- "count": 109,
- "num_samples": 1910,
- "tasks": [],
- "average_score": 0.2408454736444444
- },
- "Commonsense and Social Reasoning": {
- "count": 51,
- "num_samples": 855,
- "tasks": [],
- "average_score": 0.37895522991264047
- },
- "Ethical and Safety Reasoning": {
- "count": 15,
- "num_samples": 245,
- "tasks": [],
- "average_score": 0.48003508771929826
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 77,
- "num_samples": 1386,
- "tasks": [],
- "average_score": 0.27232427744946475
- },
- "Spatial and Temporal Reasoning": {
- "count": 152,
- "num_samples": 2437,
- "tasks": [],
- "average_score": 0.24522937191710698
- },
- "Planning and Decision Making": {
- "count": 37,
- "num_samples": 577,
- "tasks": [],
- "average_score": 0.11457024299726488
- }
- },
- "input_format": {
- "User Interface Screenshots": {
- "count": 93,
- "num_samples": 1517,
- "tasks": [],
- "average_score": 0.18941525254390731
- },
- "Text-Based Images and Documents": {
- "count": 82,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.1718334741390191
- },
- "Diagrams and Data Visualizations": {
- "count": 101,
- "num_samples": 1718,
- "tasks": [],
- "average_score": 0.28108187023954245
- },
- "Videos": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.3391119999611432
- },
- "Artistic and Creative Content": {
- "count": 32,
- "num_samples": 541,
- "tasks": [],
- "average_score": 0.36434285930327387
- },
- "Photographs": {
- "count": 143,
- "num_samples": 2248,
- "tasks": [],
- "average_score": 0.36915384448504296
- },
- "3D Models and Aerial Imagery": {
- "count": 11,
- "num_samples": 169,
- "tasks": [],
- "average_score": 0.15940750469262005
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 98,
- "num_samples": 1514,
- "tasks": [],
- "average_score": 0.2456942956200745
- },
- "structured_output": {
- "count": 110,
- "num_samples": 1714,
- "tasks": [],
- "average_score": 0.21586513216389874
- },
- "exact_text": {
- "count": 83,
- "num_samples": 1278,
- "tasks": [],
- "average_score": 0.29359048024032264
- },
- "numerical_data": {
- "count": 49,
- "num_samples": 862,
- "tasks": [],
- "average_score": 0.2646677074112521
- },
- "open_ended_output": {
- "count": 80,
- "num_samples": 1454,
- "tasks": [],
- "average_score": 0.34733130661096645
- },
- "multiple_choice": {
- "count": 85,
- "num_samples": 1363,
- "tasks": [],
- "average_score": 0.3286125236284589
- }
- },
- "input_num": {
- "6-8 images": {
- "count": 21,
- "num_samples": 314,
- "tasks": [],
- "average_score": 0.16358654572940287
- },
- "9-image or more": {
- "count": 41,
- "num_samples": 623,
- "tasks": [],
- "average_score": 0.25463059203015115
- },
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.2919119209789575
- },
- "video": {
- "count": 43,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.3391119999611432
- },
- "4-5 images": {
- "count": 34,
- "num_samples": 520,
- "tasks": [],
- "average_score": 0.20016011839130254
- },
- "2-3 images": {
- "count": 51,
- "num_samples": 802,
- "tasks": [],
- "average_score": 0.2679179451692527
- }
- },
- "app": {
- "Information_Extraction": {
- "count": 72,
- "num_samples": 1124,
- "tasks": [],
- "average_score": 0.23600902063965679
- },
- "Planning": {
- "count": 78,
- "num_samples": 1239,
- "tasks": [],
- "average_score": 0.15326915093278803
- },
- "Coding": {
- "count": 31,
- "num_samples": 474,
- "tasks": [],
- "average_score": 0.20668466311255687
- },
- "Perception": {
- "count": 145,
- "num_samples": 2313,
- "tasks": [],
- "average_score": 0.33348955971237954
- },
- "Metrics": {
- "count": 20,
- "num_samples": 309,
- "tasks": [],
- "average_score": 0.3759170425350556
- },
- "Science": {
- "count": 29,
- "num_samples": 574,
- "tasks": [],
- "average_score": 0.23894961766260706
- },
- "Knowledge": {
- "count": 97,
- "num_samples": 1605,
- "tasks": [],
- "average_score": 0.351703435685048
- },
- "Mathematics": {
- "count": 33,
- "num_samples": 547,
- "tasks": [],
- "average_score": 0.26074348700688493
- }
- }
- }
-}
\ No newline at end of file
diff --git a/static/eval_results/Default/all_summary.json b/static/eval_results/Default/all_summary.json
deleted file mode 100644
index e82fc987307418940619aab4a3a374e74c684b19..0000000000000000000000000000000000000000
--- a/static/eval_results/Default/all_summary.json
+++ /dev/null
@@ -1,525 +0,0 @@
-{
- "GPT_4o": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5203440930873326,
- "micro_mean_score": 0.514302640282204
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5265030595065238,
- "micro_mean_score": 0.5236338521693411
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.6478225794744895,
- "micro_mean_score": 0.665391229578676
- },
- "overall_score": 0.5421184432647768
- },
- "Gemini_1.5_pro_002": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4699992918320008,
- "micro_mean_score": 0.4651116133689296
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4822473962867704,
- "micro_mean_score": 0.4764805563057179
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5858190649927173,
- "micro_mean_score": 0.6104901117798793
- },
- "overall_score": 0.4955784031499121
- },
- "Gemini_1.5_flash_002": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.41898948981774853,
- "micro_mean_score": 0.4127376993779598
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4189319021967416,
- "micro_mean_score": 0.41567515414375245
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5691365176285039,
- "micro_mean_score": 0.5987532244196045
- },
- "overall_score": 0.43831534488249924
- },
- "Claude_3.5": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.48800427486796155,
- "micro_mean_score": 0.4814327812005499
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5040975742801586,
- "micro_mean_score": 0.5002259116666758
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.6373907158949892,
- "micro_mean_score": 0.6569647463456579
- },
- "overall_score": 0.5212541172602853
- },
- "Claude_3.5_new": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4919657684484185,
- "micro_mean_score": 0.4874520567007144
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5259191914020757,
- "micro_mean_score": 0.5230785894131227
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.6563419761104125,
- "micro_mean_score": 0.6724419604471196
- },
- "overall_score": 0.5427062825031487
- },
- "GPT_4o_mini": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.39854757130003565,
- "micro_mean_score": 0.3936551517403452
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.40767494558789397,
- "micro_mean_score": 0.40431644154143376
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.586537827213665,
- "micro_mean_score": 0.6133276010318144
- },
- "overall_score": 0.43069690064863675
- },
- "Qwen2_VL_72B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.46406654108789214,
- "micro_mean_score": 0.4584702152011697
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4542376574527161,
- "micro_mean_score": 0.4501201906164793
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5639771804231668,
- "micro_mean_score": 0.5835339638865004
- },
- "overall_score": 0.4769263263488681
- },
- "Qwen2_VL_7B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3480020832611913,
- "micro_mean_score": 0.3441858958345098
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3293449599230247,
- "micro_mean_score": 0.325331493515679
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1170,
- "macro_mean_score": 0.43955105763038577,
- "micro_mean_score": 0.45508547008546996
- },
- "overall_score": 0.3597856146156421
- },
- "llava_onevision_72B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3199332158220174,
- "micro_mean_score": 0.31770770553892647
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2974368415462532,
- "micro_mean_score": 0.2956217833156672
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.4599484231632498,
- "micro_mean_score": 0.4850386930352536
- },
- "overall_score": 0.33795497518277007
- },
- "llava_onevision_7B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22409531510496777,
- "micro_mean_score": 0.22238854298563537
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.21362697219149712,
- "micro_mean_score": 0.21073910058505504
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.33979975321921935,
- "micro_mean_score": 0.36474634565778147
- },
- "overall_score": 0.23898796555531696
- },
- "InternVL2_76B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3502244283768534,
- "micro_mean_score": 0.3456783051732046
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3562710424410931,
- "micro_mean_score": 0.35129859801162616
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5192997443033639,
- "micro_mean_score": 0.5421324161650903
- },
- "overall_score": 0.3772549347599992
- },
- "InternVL2_8B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.25956581776451815,
- "micro_mean_score": 0.2546984460483302
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.24090301358258295,
- "micro_mean_score": 0.23819084111520938
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1165,
- "macro_mean_score": 0.3978571701460552,
- "micro_mean_score": 0.4108583690987125
- },
- "overall_score": 0.2773656948037259
- },
- "MiniCPM_v2.6": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2287645706203155,
- "micro_mean_score": 0.2249087742955901
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22955895202146906,
- "micro_mean_score": 0.22560399396899078
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.41728623355613875,
- "micro_mean_score": 0.43452278589853827
- },
- "overall_score": 0.2537218694467236
- },
- "Phi-3.5-vision": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.23271251159409778,
- "micro_mean_score": 0.2296262323791101
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22995297916629392,
- "micro_mean_score": 0.22708502951025372
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3947914647737769,
- "micro_mean_score": 0.42459157351676696
- },
- "overall_score": 0.25357415903306635
- },
- "Pixtral_12B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.31905695620134694,
- "micro_mean_score": 0.31556607913724777
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.31362045151669854,
- "micro_mean_score": 0.3100986209078182
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.4566234428542061,
- "micro_mean_score": 0.4870593293207223
- },
- "overall_score": 0.33676353369131895
- },
- "Llama_3_2_11B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.10044261716549671,
- "micro_mean_score": 0.09980638766828835
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.15999641916771298,
- "micro_mean_score": 0.15809331016967038
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3173342406187366,
- "micro_mean_score": 0.3487962166809973
- },
- "overall_score": 0.1802478219287358
- },
- "Idefics3": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.11118980301103833,
- "micro_mean_score": 0.11201785633274061
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.08956972487602757,
- "micro_mean_score": 0.08982225274252693
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3210866162255635,
- "micro_mean_score": 0.35649183147033553
- },
- "overall_score": 0.138206224513898
- },
- "Aria": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.30485930718699694,
- "micro_mean_score": 0.3016713629035311
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.289073788209904,
- "micro_mean_score": 0.2859007507765791
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5103725263180767,
- "micro_mean_score": 0.5349957007738607
- },
- "overall_score": 0.3313115037088191
- },
- "NVLM": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2420528895703979,
- "micro_mean_score": 0.23838419989257642
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.21589726765847422,
- "micro_mean_score": 0.21406043849932396
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3478114310231307,
- "micro_mean_score": 0.3947549441100602
- },
- "overall_score": 0.25566537510391796
- },
- "InternVL2_2B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.09089701489596874,
- "micro_mean_score": 0.09036328295381871
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.13141974398938763,
- "micro_mean_score": 0.13063500716262516
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.23864417043743646,
- "micro_mean_score": 0.24901117798796224
- },
- "overall_score": 0.14522090778963154
- },
- "Qwen2_VL_2B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.16448220309703876,
- "micro_mean_score": 0.1610710186451323
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.20877163406364055,
- "micro_mean_score": 0.20561526268932287
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3154302566225611,
- "micro_mean_score": 0.33856405846947557
- },
- "overall_score": 0.22249997162072932
- },
- "Aquila_VL_2B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.16317824309838627,
- "micro_mean_score": 0.16198837245148487
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.159970161379836,
- "micro_mean_score": 0.15844711671722148
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.24567572098570653,
- "micro_mean_score": 0.2704213241616509
- },
- "overall_score": 0.17379673035120966
- },
- "Mammoth_VL": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.264052880412689,
- "micro_mean_score": 0.2626894374387823
- },
- "core_cot": null,
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.37992668750165337,
- "micro_mean_score": 0.40120378331900275
- },
- "overall_score": 0.27896733083008046
- }
-}
\ No newline at end of file
diff --git a/static/eval_results/Default/llava_onevision_72B/summary_results.json b/static/eval_results/Default/llava_onevision_72B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2eb71da75405e6141add0c6e95de67741daab5e6
--- /dev/null
+++ b/static/eval_results/Default/llava_onevision_72B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.2974368415462532,
+ "micro_mean_score": 0.2956217833156672
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.4599484231632498,
+ "micro_mean_score": 0.4850386930352536
+ },
+ "overall_score": 0.31835417383358944
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.3305832092026115
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.2664116432811501
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.3495276153952721
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.39896965542882173
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2861655413017371
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.45858638429470816
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.5934010025062657
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.30604427435146236
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.27899574672445293
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.12433347702554473
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.21351320454567943
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.21295992410688594
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.32763074938212144
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.39619210332031635
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.43323889670054355
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.39984139901797444
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.20740773655402334
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.25996561636037274
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.25556145878894343
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.3399610538914775
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2976261136565818
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.42431325255029934
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.3580583490549799
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.19965041572184428
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.22764372137050506
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.33685775371860216
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.39619210332031635
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.22870674199032645
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.32000636054527115
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.2754908385554327
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.1617746235264615
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.24538794012228551
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.37170152595100986
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.40489633872843234
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.2903113179276
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.41431490471877547
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.30623783684939326
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/llava_onevision_72B/task_results.json b/static/eval_results/Default/llava_onevision_72B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a83dc338e703fdbe458d83b59a8a58a872050a1
--- /dev/null
+++ b/static/eval_results/Default/llava_onevision_72B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.4694874961506745,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.0013130777246558834,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.2833333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.10204081632653059,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.4063775510204081,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.7,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.45686454871884236,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.10277361750041909,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5647397973618976,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.4545454545454545,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.26071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21063161048981988,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.554,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.7673845718816646,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.48888888888888893,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.17777777777777776,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.12605042016806725,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6391210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.30158730158730157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5830473772319825,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.8222222222222223,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6007894736842104,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.529357142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.4523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.78125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.15178571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.496031746031746,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8263285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.559142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.21560071069920497,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.4171368421052631,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.4017857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.19642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.29166666666666663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.15476190476190474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07748064743138636,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "worldle",
+ "score": 0.1675305866800247,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.007919756956365811,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8761904761904761,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.12244897959183673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.21479662373299105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.35,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.3497448979591837,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.2642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.011904761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.18095238095238098,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.15151515151515152,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3406862745098039,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6019887092978411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3095238095238096,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.040960740741354244,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.23333333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6370339174257883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.5882352941176471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.4000000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.08484848484848483,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.21739130434782608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.635294117647059,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.36981292517006803,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.28985260770975046,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.248,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8428571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.6448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.4937499999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.29375,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.31052631578947365,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.45714285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.6931034482758622,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.7642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7310344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.4444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.4842105263157894,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.5333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.6214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.4827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.36551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.24666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6827586206896553,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4193548387096775,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.4551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.6857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.46,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.23333333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.6785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6346153846153846,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.41111111111111115,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.46842105263157896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.4071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.6413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8157894736842105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.5517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.42666666666666664,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.10714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.07857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.11428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.11428571428571431,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.40714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.745,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.5700000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.805,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.2714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/Default/llava_onevision_7B/summary_results.json b/static/eval_results/Default/llava_onevision_7B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..28bba3867965ba53739f13ae6e96aa8e53be256d
--- /dev/null
+++ b/static/eval_results/Default/llava_onevision_7B/summary_results.json
@@ -0,0 +1,251 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.21362697219149712,
+ "micro_mean_score": 0.21073910058505504
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.33979975321921935,
+ "micro_mean_score": 0.36474634565778147
+ },
+ "overall_score": 0.2298670331158574
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.24537135448488254
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.1811965364419926
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.24900339991899337
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.29226125591371144
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.18715552665467763
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.35893459469741823
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4261779448621554
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.23519232289471675
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.21092208834236795
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.05120126248868793
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.12311222499137182
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.13426264370971033
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.23906185495910173
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2849009636494337
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3190474713712686
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3199649328875728
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.11485315822870823
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.1918463706234823
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.1576113794541456
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.25634975084939915
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.20701788214046113
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.3110674582207745
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.2780982759930128
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.15675547996976566
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.1264959299900332
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.24529675139199647
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2849009636494337
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.18774519614511564
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.22945291608261006
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.17072269450773786
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.0916060531149872
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.13896832926719074
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.2764008849458886
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3730746617295249
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.24378647619197266
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.31371933170977295
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.2211309948951352
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/Default/llava_onevision_7B/task_results.json b/static/eval_results/Default/llava_onevision_7B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca9b89fa27cd44459fa62b05606715ba8cea739a
--- /dev/null
+++ b/static/eval_results/Default/llava_onevision_7B/task_results.json
@@ -0,0 +1,7756 @@
+[
+ {
+ "name": "face_identity_matching",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "IAM_line_ocr_and_locate",
+ "score": 0.18393003523735163,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "av_multicamera_tracking_predict_bbox",
+ "score": 0.034222739980969856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevr_arithmetic",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualdial_visual_dialog_image_guessing",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_object_existence_video",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.1530612244897959,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "star_object_interaction_video",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_content_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_similar_scene_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cam_traj_to_video_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "photo_sharing_image_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "perception_test_video_character_order",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "coco_ood_global_image_retrieval_by_query_property",
+ "score": 0.23841991341991345,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "arc_agi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "vizwiz_quality_accessment_for_blind",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_human_multiview_counting",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "emotion_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3180487534871889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.07094776572587472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.27730527983349595,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_moving_direction_video",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.3246753246753246,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_cycle",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "av_vehicle_multiview_counting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21124366266226283,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.3466666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sta_action_localization_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_object_shuffle_video",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "video_to_camera_trajectory_retrieval",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "cheapest_flight_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.3111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "muma_theory_of_mind_social_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "clevrer_video_moving_object_property_recognition",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "action_sequence",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "game_info_retrieval",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "av_view_identification",
+ "score": 0.12222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "code_programming_test_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "muma_theory_of_mind_belief_of_goal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "code_programming_extremely_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.04201680672268908,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_fragments_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.39,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.3174603174603175,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.19287649674503982,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vlnqa_egocentric_navigation_video",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "landmark_check_two_images",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.4274736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.23392857142857157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_load_type_prediction_from_plot",
+ "score": 0.19047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.021428571428571432,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "code_error_line_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.13392857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.24206349206349206,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_mc",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_retrieval",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "clevrer_video_moving_object_count",
+ "score": 0.19047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 21,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.36628571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.3349285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logo2k_same_type_logo_retrieval",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.04435358813204807,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.3337526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_hamiltonian_path",
+ "score": 0.4107142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "perception_test_video_action_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "video"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.13392857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geographic_remote_sensing_land_cover",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visual_prediction_rater_plane_segmentation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_novel_view_synthesis",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "visual_prediction_rater_semantic_segmentation",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_panoptic_segmentation",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_surface_normal_estimation",
+ "score": 0.23809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_depth_estimation",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_prediction_rater_openable_part_segmentation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "visual_correspondance_in_two_images",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_circle_reasoning",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_tegulu_next_step",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_factual_pref",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_polyp_segmentation_single_object_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "recipe_image_ordering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_visualization_output_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 10,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_view",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "video_grounding_temporal",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09455782312925169,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_english_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "logical_reasoning_2d_folding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_hard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "worldle",
+ "score": 0.007154015950258795,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_keywords_based_retrieval_non_radiology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sign_language",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_camera_motion_description",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "video"
+ },
+ {
+ "name": "video_segments_reordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_grounding_spatial",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_match_problem",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "music_sheet_sentiment",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_eval_visual_pref",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.09394266841882398,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "semantic_matching_of_two_images",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_visual_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_retrieval",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_recommendation",
+ "score": 0.3779761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "photoshop_operation",
+ "score": 0.19047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_homepage_profile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "vln_hindi_next_step",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "media_recommend_solutions_stackoverflow",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "dish_ingredient_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "google_streetview_direction_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "music_sheet_name",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "code_translation_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ancient_map_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "paper_review_acceptance",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rocks_samples_compare",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_reasoning",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_intent_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "video"
+ },
+ {
+ "name": "counting",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.38095238095238093,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_solution_compare",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "2d_image_jigsaw_puzzle_easy",
+ "score": 0.2678571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_content_based_retrieval_radiology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.10101010101010101,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiview_reasoning_camera_moving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rocks_samples_identify",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "booking_web_rating",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "comic_page_ordering",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_event",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.552269841333154,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_robot",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "multilingual_news_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "move_pos_to_pos_hanoi_4_pole",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting_multi_image",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "code_translation_advanced",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "google_streetview_circle_sorting",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_sign_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_author",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_action_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "video"
+ },
+ {
+ "name": "functionality_matching_in_different_objects",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "Movie_retrieval_by_actor",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "TV_show_retrieval_by_character",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "Forensic_Detection_of_different_images",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pokemon_3D_recognition",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_review_rating",
+ "score": 0.6449456155428795,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "numerical_data",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TRANCE_physics_reasoning_basic",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "waldo",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "google_streetview_line_sorting",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_eval_dynamic_pref",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.13157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_translation_Python",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_parasite_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vln_identify_location",
+ "score": 0.14545454545454545,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat_position_only_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_multi_contain_repeat",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_xor_images",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_position_images",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_multi_contain_position_only",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_contain_images",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_interactive_operations_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "app_interactive_operations_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6823529411764707,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_T2I_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_motion_matching_3D_real",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "video_motion_matching_real_3D",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ball_cup_swap_3",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.19047619047619047,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.21116780045351471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_subject",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_control",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_aesthetics",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_mask",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_semantics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_motion_guided_editing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "autorater_3d_model_texturing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Metrics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "autorater_unmask",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_abdomen_MRI_organ_recognition",
+ "score": 0.09523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_abdomen_endscopy_organ_recognition",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_retrieval_given_surgeon_activity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Videos",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "video"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_output_result",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_add_tag",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "painting_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.5620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.39285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.3689655172413792,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.1482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3111111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_summary",
+ "score": 0.25000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_magic_video",
+ "score": 0.3866666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "paper_review_writing",
+ "score": 0.12666666666666665,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "activitynetqa",
+ "score": 0.38947368421052636,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.16399999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "generated_video_artifacts",
+ "score": 0.10000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.4285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "wikihow_complex_task_completion",
+ "score": 0.36666666666666664,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "9-image or more"
+ },
+ {
+ "name": "video_detail_description",
+ "score": 0.21052631578947367,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "funqa_unexpected_action_creative_video",
+ "score": 0.23333333333333336,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7736842105263159,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_qa",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.23571428571428577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nextqa_oe",
+ "score": 0.30526315789473674,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.29310344827586204,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.5586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_short_title",
+ "score": 0.3785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "video2notes",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5500000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "doc_vqa",
+ "score": 0.5125000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "4-5 images"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.42666666666666664,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "video_content_follow_up",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Videos",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5129032258064516,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.293103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.3448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funqa_unexpected_action_humor_video",
+ "score": 0.24666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Videos",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5275862068965516,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.692857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.21428571428571425,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.25,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.5050000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6450000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_B",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6599999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visual_order_A",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "2-3 images"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.71,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.910526315789474,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.12142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.4214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
+ "score": 0.08571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
+ "score": 0.19999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_manual_explanation_scooter_French",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "6-8 images"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "sceneqa_scene_transition_video",
+ "score": 0.2714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Videos",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "video"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Aquila_VL_2B/summary_results.json b/static/eval_results/SI/Aquila_VL_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff246bf1cd8585833334967628ff5f37f092ebf6
--- /dev/null
+++ b/static/eval_results/SI/Aquila_VL_2B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.20770364903712493,
+ "micro_mean_score": 0.20333142638522636,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.31474202723571276,
+ "micro_mean_score": 0.3326568265682657,
+ "missing_tasks": []
+ },
+ "overall_score": 0.22197543279693666
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.17480107496737848
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.2374462987863378
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.3521969849344277
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.19504930283108274
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.2521179990443663
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.20221672149607509
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.18502360430789122
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.0625675073438388
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.3826225373137124
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6020225563909773
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.11601893140078427
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.11430966292465267
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.3533180891172854
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.2248398559924241
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.3078950207372175
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.10279080594456047
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.13944236147744013
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.1772030496280578
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.1884228017877996
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.23519563962981577
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.28092356180071465
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.43875114784205704
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2219754327969366
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.08500232938689507
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.08421801129956362
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.23446107609710548
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.3004030430829456
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.39206349206349206
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.2897054521388083
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.2736043135287443
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.19099680045595863
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Aquila_VL_2B/task_results.json b/static/eval_results/SI/Aquila_VL_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..47f26fa305c99a0ba9c008fc00a7e43c18f187ce
--- /dev/null
+++ b/static/eval_results/SI/Aquila_VL_2B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.20902382802982977,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.024221420767560734,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.08027210884353742,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7023809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.0196078431372549,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.10101010101010101,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.008685714285714291,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.02728669632537238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.28827816841022125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.13157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.38095238095238093,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.23691578947368427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5319681933751316,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.12301587301587302,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.08778571428571448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.16964285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.2531585786626469,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.2272727272727272,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.2795714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.18333333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.3717857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.5177142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.11224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.32,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.16071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.0627244690319781,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.22222222222222218,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.03361344537815126,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.34842105263157896,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.28888888888888886,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.4186666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5224414005921536,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.3210526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.611764705882353,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.16326530612244897,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.4344827586206897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.26206896551724135,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.20344827586206893,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.42857142857142866,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.4448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.23103448275862065,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.44482758620689644,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.3931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.2413793103448275,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.36666666666666664,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.3357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.22758620689655168,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.1857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5269230769230768,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.09,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4451612903225806,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.44482758620689666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.6631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.32857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.1928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.15000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.41428571428571426,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8473684210526317,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.5850000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.5999999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.4499999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6799999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Aria/summary_results.json b/static/eval_results/SI/Aria/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5648c2026d713e85a8b3c03c640ec9f3a4d53c86
--- /dev/null
+++ b/static/eval_results/SI/Aria/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3178882776147889,
+ "micro_mean_score": 0.3101511832828904,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.5137437248005172,
+ "micro_mean_score": 0.5472939729397295,
+ "missing_tasks": []
+ },
+ "overall_score": 0.34400233723955265
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.3653361644690575
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.33433893000455434
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.49083567506460973
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.300830802045758
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.40684369400912745
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.3401734439719901
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.22595636868728874
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.07560632809892315
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.5240018518464876
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7129097744360902
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.27807228404309764
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.259572791833904
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.45572004760273754
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.3300885226603808
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.40912566596786665
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.04960831797041802
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.3227895527307711
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3053148323646246
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.2579833154471113
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.3082165471908181
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.45805038774421686
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.4787572696663607
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.3440023372395526
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.33746818901184633
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.10860172719687727
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.38003253384687213
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.4433718463877228
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.4142857142857143
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.3496496998103286
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.4097428531166082
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.22745674367681176
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Aria/task_results.json b/static/eval_results/SI/Aria/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1be1ac46152552e5921e47545e02af483a31143
--- /dev/null
+++ b/static/eval_results/SI/Aria/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.16326530612244897,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.4659892098786556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.02157313400640287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.317156862745098,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.12121212121212122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.17142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.21739130434782608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.001383202390173732,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.2599548305299542,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.18421052631578946,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5238095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.45596842105263163,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5841053655504803,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.5317460317460317,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.13450000000000023,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.13156684963502452,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.3482142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.33700342022200497,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8506493506493505,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7401428571428569,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5095714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9448142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.66,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.5982142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.027788064512264614,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.6984126984126985,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8865546218487397,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.78125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6926315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21521614907043682,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6946666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5392040026539403,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.47489473684210526,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.09285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.47619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6705882352941178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.25212585034013607,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1272675736961451,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.789655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.6517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.5333333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7517241379310344,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7103448275862071,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.4517241379310344,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7724137931034484,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7793103448275864,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5724137931034483,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.42666666666666664,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.557142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7307692307692306,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.21000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.44838709677419364,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6965517241379309,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.805263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.4071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.3214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.2357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.43571428571428567,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.5428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.10714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.43571428571428567,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.10714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.910526315789474,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7750000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Claude_3.5/summary_results.json b/static/eval_results/SI/Claude_3.5/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8d28fc504e0065f3389cbbbf63b00505e1bcc62
--- /dev/null
+++ b/static/eval_results/SI/Claude_3.5/summary_results.json
@@ -0,0 +1,215 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "macro_mean_score": 0.520276385877485,
+ "micro_mean_score": 0.5148202137998056
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "macro_mean_score": 0.6479684260295507,
+ "micro_mean_score": 0.6801968019680197
+ },
+ "overall_score": 0.5373019912310938
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.6192941518442948
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.5499261524919171
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.636886763741019
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.5044379729567133
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.5757222503903228
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.530309401925396
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.4511182385296208
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.19196633042767672
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.6017116084931068
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7033233082706767
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.54981020669637
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.4753194125515341
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5314705050989759
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.5589506892621444
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.6014068374421209
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.34512576094802216
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.5556080592390198
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.5072889926389097
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.5112348724553849
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.4712835541311676
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.5769294912151234
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.6164633346451529
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5373019912310933
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.6692574633083122
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.315623741632974
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.6124985410830999
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.6061759059165749
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.4174603174603175
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.5134329832846579
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.5401030980230185
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.4760293511799448
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Claude_3.5/task_results.json b/static/eval_results/SI/Claude_3.5/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8339ec0de99b6d9e20d3247591052dc6ab7fd8c
--- /dev/null
+++ b/static/eval_results/SI/Claude_3.5/task_results.json
@@ -0,0 +1,4818 @@
+[
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.47189890122171807,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.7172619047619049,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.06698805429719713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.42424242424242425,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0035714285714285718,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.5495098039215687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.07140372068949602,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5487385867546344,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.7448979591836732,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.6787142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.7321428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.5416666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.22448979591836735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09714285714285713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.5217391304347826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.8250714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6207368421052633,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.2688508092335989,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.7853333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.7053571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.8303571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7698412698412698,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.7095421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5565966568582713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04739437903890144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.5987447167547407,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.5753130452443872,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9087301587301589,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9415584415584416,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.333520279485717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7636842105263157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.7071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5531252543894322,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.8095238095238094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9047619047619049,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.7714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.82,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9841571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.3397142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.5346938775510204,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.4522108843537415,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5529411764705884,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.8142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.7,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.5857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.7,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.8450000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7222222222222222,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.258,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8473684210526317,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.8931034482758619,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.882758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8827586206896549,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.3285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7307692307692307,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5866666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.45806451612903226,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8931034482758619,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6499999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.7517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8310344827586205,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.9349999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8850000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.8100000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.4142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.4928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.4714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.4357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.5785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.47857142857142854,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Claude_3.5_new/summary_results.json b/static/eval_results/SI/Claude_3.5_new/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c88676ad4f2fab27160f4dcc3b8dfe2136a9e2d
--- /dev/null
+++ b/static/eval_results/SI/Claude_3.5_new/summary_results.json
@@ -0,0 +1,215 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "macro_mean_score": 0.5462752278980763,
+ "micro_mean_score": 0.5417881438289601
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "macro_mean_score": 0.6764020657053476,
+ "micro_mean_score": 0.6924969249692496
+ },
+ "overall_score": 0.5636254729390457
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.6242355223474262
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.583387314927874
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.6507240054983652
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.5171075478248572
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.6234123112506059
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.5426169039575065
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.48795977188332873
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.22440221381985706
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.6433122980573076
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6839924812030076
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.574168555556774
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.4705509892153899
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5838312144672865
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.5899091882733952
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.5927094432064197
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.3619606028475468
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.5638133905687104
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.5249488326690246
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.5300876558354416
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.5106873710119535
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.6409616762702612
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.6380252743889108
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5636254729390459
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.6633000290867174
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.3511145464456188
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.565344887955182
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.6465631513465354
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.6285714285714286
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.5580232103280633
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.5737128945237007
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.4831956110227109
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Claude_3.5_new/task_results.json b/static/eval_results/SI/Claude_3.5_new/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..264152a89dca9bb2f8d943620c545b74d2e3a5ff
--- /dev/null
+++ b/static/eval_results/SI/Claude_3.5_new/task_results.json
@@ -0,0 +1,4818 @@
+[
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.23684210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.40241040325976846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5952380952380951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.5151515151515151,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.6607142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.4946078431372549,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.06110399705595322,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.003401360544217687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5750644816731951,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.02989318393830872,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.24489795918367346,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6530612244897959,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.7402142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.7589285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.4583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.8218571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.32653061224489793,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.18566544566544566,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.8389999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.5217391304347826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.26289170215820523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.868,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.711111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6805363628538211,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.78125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.8650793650793652,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.8811526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.6339993725717702,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4444444444444445,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9166666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9480519480519481,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.2531109353882501,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7710526315789472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.7642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.6062431664706708,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.08106406283795066,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.6274393183836207,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.82,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9743499999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.23921428571428613,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.5666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5529411764705883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.6928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.8235294117647058,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.5210884353741496,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.47066326530612246,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.7428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6888888888888888,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8642857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.282,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8578947368421055,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.7142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.9103448275862066,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8620689655172412,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.893103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.43571428571428567,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7038461538461539,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.7266666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4580645161290323,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8344827586206894,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.8379310344827589,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.7413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.786206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8379310344827587,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.942857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.7785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.9650000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.825,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7699999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.8350000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.607142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.6214285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.5214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.5928571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/GPT_4o/summary_results.json b/static/eval_results/SI/GPT_4o/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3fd663e31742e6d071f91953e3b43b831fc05c54
--- /dev/null
+++ b/static/eval_results/SI/GPT_4o/summary_results.json
@@ -0,0 +1,215 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "macro_mean_score": 0.5529953662872719,
+ "micro_mean_score": 0.5483479105928085
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "macro_mean_score": 0.6600228904804206,
+ "micro_mean_score": 0.6801968019680197
+ },
+ "overall_score": 0.5672657028463584
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.6400436962819274
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.5798789532163023
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.6933181759121947
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.47164342831848766
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.6512174145248227
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.5506629280904943
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.4267383416112408
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.1970421212123289
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.6716375018861761
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7342894736842105
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.6093502418300007
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.4938444672052553
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.6107746700730057
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.533172482404735
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.6086090683867454
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.3427989299648589
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.5370230887013343
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.5351259728352326
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.6016521462358102
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.4632537848154335
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.6563556079088679
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.6204512659058113
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5672657028463585
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.7387886231372116
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.302146719713088
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.5785991479925302
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.6418126331560571
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.626984126984127
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.5184702350129554
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.6073751328612617
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.4387500704123191
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/GPT_4o/task_results.json b/static/eval_results/SI/GPT_4o/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..876a4f2ad46b798d3dd71ec35bfa178b47f72cc4
--- /dev/null
+++ b/static/eval_results/SI/GPT_4o/task_results.json
@@ -0,0 +1,4818 @@
+[
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.5564421945052599,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.5571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8928571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.5050505050505051,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4379245788668292,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.40294117647058825,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.1858388265990491,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5925323909834338,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0009041591320072332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.01601312748867357,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6632653061224488,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4767857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.8947368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.3673469387755102,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.4583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.1496598639455782,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.6956521739130435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7872142857142859,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6918947368421055,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.2785198065092178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.828,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.8303571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.711111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.95,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.8095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.3469387755102041,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.5982549376215841,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.8253968253968255,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.7131684210526317,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.5867591836191252,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.6444444444444445,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9285714285714288,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8766233766233764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.2903422951989705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7417368421052631,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.7142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.6477943776571286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5807339650392197,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.18559785992971775,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.74,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.8333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9764785714285713,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.12478571428571421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.55,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.4562925170068027,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.36553287981859406,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.7428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.7571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.5428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8949999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7250000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.765,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.7578947368421054,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.7142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.7214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.6785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6777777777777777,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.9142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.36200000000000004,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8315789473684211,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6642857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7423076923076924,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.8428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8666666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.49354838709677434,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.8620689655172411,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.7310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.9068965517241377,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6172413793103447,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.627586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8310344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8275862068965518,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/GPT_4o_mini/summary_results.json b/static/eval_results/SI/GPT_4o_mini/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc69433e9989576e5a8c7974b79e624af8c0838b
--- /dev/null
+++ b/static/eval_results/SI/GPT_4o_mini/summary_results.json
@@ -0,0 +1,215 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "macro_mean_score": 0.4431039098921726,
+ "micro_mean_score": 0.43780369290573373
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "macro_mean_score": 0.595574663769726,
+ "micro_mean_score": 0.6334563345633456
+ },
+ "overall_score": 0.46343334374251305
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.503118803606002
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.48241878593503174
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.5987052352447554
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.37680368570252215
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.5458509360302554
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.4555977624507237
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.33277278942510824
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.17294565844175996
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.5775026308600164
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7960714285714285
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.4645916955325127
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.3779902828155749
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5569877095654321
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.4194828137611333
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.5198662454862603
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.1194248916897328
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.4761935495255144
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.38282644938937405
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.42048902061937554
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.3777213713726476
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.5986898724975707
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5559184922821285
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.46343334374251277
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.5484747566251307
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.22983305008250185
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.4556095354808589
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.5437015929631214
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.4873015873015873
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.39601047285667923
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.535145025177205
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.33759329198549914
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/GPT_4o_mini/task_results.json b/static/eval_results/SI/GPT_4o_mini/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..25b2399a65df86e6e32cbc02e5ce63907af6297b
--- /dev/null
+++ b/static/eval_results/SI/GPT_4o_mini/task_results.json
@@ -0,0 +1,4818 @@
+[
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.4404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.02971437714058806,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.2653061224489796,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.30434782608695654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9119047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3348039215686274,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.49999999999999994,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.5857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.48571428571428577,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.43979842890651355,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.36666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.29292929292929293,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5882352941176472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.22491496598639452,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.2505668934240363,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.17982456140350878,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.048713528589567665,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.024564069093751337,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.6020408163265306,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6480000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7010526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.43662631578947375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6955714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.457498007685276,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5423192899685483,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6498716440678927,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5455714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.27142857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6964285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.7275263157894736,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5535393001296958,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9505142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.1969956173950675,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.24388210678357394,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.4166666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8506493506493505,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6357142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.8769841269841271,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9705882352941178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.37142857142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.41428571428571426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.43050085804176885,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.004214285714285663,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7539682539682541,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8200000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6931034482758621,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7615384615384616,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.875862068965517,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5206896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8379310344827584,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.872413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.7206896551724139,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.817241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8379310344827586,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8210526315789474,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4548387096774193,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.6857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.5666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8950000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8000000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8800000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.42857142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7750000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.2071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.3285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.72,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.3559999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.6571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..95d4d30ad782486786b62a3c713ef22d885e1b95
--- /dev/null
+++ b/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json
@@ -0,0 +1,215 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "macro_mean_score": 0.43481964330318734,
+ "micro_mean_score": 0.4297862001943635
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "macro_mean_score": 0.5787083135236054,
+ "micro_mean_score": 0.6186961869618696
+ },
+ "overall_score": 0.4540047993325765
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.4474763430506795
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.47630441828533016
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.5920539115535787
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.4086167264646781
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.5122400421391089
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.4655431430975485
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.3559690476405975
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.09741974015331743
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.5955368143490581
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7948947368421052
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.3722082840493195
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.33052002642818507
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5613400178213946
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.46724590271207767
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.5535202379362348
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.3348446026637953
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.43823554216399857
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3691249729531883
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.42013434507914493
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.43247267273235235
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.5470781816319514
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5905636451090996
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.45400479933257654
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.45245079667466714
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.21148887498941377
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.47487599206349207
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.5468998820129136
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.4380952380952381
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.48499051643275837
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.5086518140501541
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.3853815223607656
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Gemini_1.5_flash_002/task_results.json b/static/eval_results/SI/Gemini_1.5_flash_002/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..acd24bb35a4b89f35e2ec3c316d485f0d30c39d1
--- /dev/null
+++ b/static/eval_results/SI/Gemini_1.5_flash_002/task_results.json
@@ -0,0 +1,4818 @@
+[
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.8888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.4068877551020408,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.3469387755102041,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.03886509470801488,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.22271751659129607,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5476190476190476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.47990196078431374,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.2727272727272727,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3100359127375053,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0319296239070534,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9404761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.49714178831993683,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.30612244897959184,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.09826063389901919,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5408163265306122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5916519873131821,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.2894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5600000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.746390336033466,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9621285714285712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7057894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5798723155227672,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.9017526315789473,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7478991596638657,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.570486129111546,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7672857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.21428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.7220526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.24564101770091742,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.11578571428571437,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7727272727272726,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.7539682539682538,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.589357142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.3928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.19999999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.4714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.24492301011444534,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.753968253968254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.30434782608695654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07619047619047618,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.35000000000000003,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.3137755102040816,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.2828798185941043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7666666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.8571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8263157894736842,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.6214285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.45000000000000007,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.9068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8533333333333335,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.789655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7758620689655171,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.3071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8137931034482758,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.1857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.8448275862068967,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.47857142857142865,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8300000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8500000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6884615384615385,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.3806451612903227,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.705,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.24285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5214285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7850000000000004,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.32142857142857134,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6482758620689654,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.7068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b782cc9ac005a5507ee5cb19b6ce2a0a0098b4ca
--- /dev/null
+++ b/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json
@@ -0,0 +1,215 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "macro_mean_score": 0.4914311038229404,
+ "micro_mean_score": 0.48323615160349853
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "macro_mean_score": 0.5814975405131552,
+ "micro_mean_score": 0.6174661746617466
+ },
+ "overall_score": 0.5034399620483024
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.5000257619938475
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.5220033468415737
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.6342882147970302
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.448099634405986
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.5647567827649111
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.5090111123207751
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.3972807544005462
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.178032259819607
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.5995804836966744
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7830639097744362
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.42724302639929596
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.4060403716629095
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5888558035357285
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.5132563067393096
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.6217290675275775
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.3592697030984118
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.4972242280053817
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.43754003302746525
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.4731762319443037
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.48334866543174226
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.5644701189535662
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.6245091608727974
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5034399620483027
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.504539358390968
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.28536490696494377
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.48587549603174607
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.5964613809728712
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.48888888888888893
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.500158537824293
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.5660366627264668
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.4200866579901879
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Gemini_1.5_pro_002/task_results.json b/static/eval_results/SI/Gemini_1.5_pro_002/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f197c4416bb4dd920f70a2abce6c2201fbb412d2
--- /dev/null
+++ b/static/eval_results/SI/Gemini_1.5_pro_002/task_results.json
@@ -0,0 +1,4818 @@
+[
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6199454600186646,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counting",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.4119942575491687,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.29292929292929293,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.31428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4699566675933124,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.4656862745098039,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.9642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Coding",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.06762834530316385,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0035714285714285718,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.3877551020408163,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8025210084033615,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.831857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "rebus",
+ "score": 0.391304347826087,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.03864007436439077,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6723157894736841,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.30612244897959184,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.30454267975765786,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5510204081632654,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.668,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.7153571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_execution",
+ "score": 0.4375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5238095238095237,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Planning and Decision Making",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.14711083476825218,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7460317460317462,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6758816417011395,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_parity",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figureqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.82,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.8486368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "algebra",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.555696767990635,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7089473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.8174603174603176,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.3857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8246753246753247,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.66869355335515,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9712357142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.32509082865144884,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.2693571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.8333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.7357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.35,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Metrics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Language Understanding and Generation",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.35294117647058826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Mathematics",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.36734693877551017,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.40232426303854874,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Information_Extraction",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Mathematics",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation",
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6117647058823531,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5945319390969315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.4142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.5142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.6285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.3714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Object Recognition and Classification",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Information_Extraction",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Mathematical and Logical Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "3D Models and Aerial Imagery",
+ "app": "Perception",
+ "output_format": "multiple_choice",
+ "num_input": "1-image"
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "exact_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "numerical_data",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "contextual_formatted_text",
+ "num_input": "1-image"
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.7111111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8263157894736843,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.48,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.692857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.8896551724137929,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.8,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.8241379310344826,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.8206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.8103448275862067,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.42580645161290337,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "iq_test",
+ "score": 0.6,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Spatial and Temporal Reasoning"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.8758620689655172,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Spatial and Temporal Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6655172413793102,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Language Understanding and Generation",
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Object Recognition and Classification",
+ "Mathematical and Logical Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.8137931034482755,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7965517241379312,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29,
+ "skills": [
+ "Scene and Event Understanding",
+ "Language Understanding and Generation",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8857142857142859,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Scene and Event Understanding",
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Coding",
+ "output_format": "structured_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Perception",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.82,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7750000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Text Recognition (OCR)"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.74,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Scene and Event Understanding",
+ "Commonsense and Social Reasoning",
+ "Ethical and Safety Reasoning"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Text-Based Images and Documents",
+ "app": "Science",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6346153846153848,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26,
+ "skills": [
+ "Text Recognition (OCR)",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Information_Extraction",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.765,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Scene and Event Understanding",
+ "Object Recognition and Classification"
+ ],
+ "input_format": "Photographs",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19,
+ "skills": [
+ "Ethical and Safety Reasoning",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "Artistic and Creative Content",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.21428571428571433,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Planning and Decision Making",
+ "Mathematical and Logical Reasoning",
+ "Domain-Specific Knowledge and Skills"
+ ],
+ "input_format": "Diagrams and Data Visualizations",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.15000000000000005,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Object Recognition and Classification",
+ "Spatial and Temporal Reasoning",
+ "Planning and Decision Making"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Planning",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.22142857142857147,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Commonsense and Social Reasoning",
+ "Language Understanding and Generation"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.3000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.23571428571428577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14,
+ "skills": [
+ "Language Understanding and Generation",
+ "Commonsense and Social Reasoning"
+ ],
+ "input_format": "User Interface Screenshots",
+ "app": "Knowledge",
+ "output_format": "open_ended_output",
+ "num_input": "1-image"
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Idefics3/summary_results.json b/static/eval_results/SI/Idefics3/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..486dce3311f80e350c2765b963dfc7581e29f78f
--- /dev/null
+++ b/static/eval_results/SI/Idefics3/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.08941182847569326,
+ "micro_mean_score": 0.08779475233900695,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3231434267517844,
+ "micro_mean_score": 0.3618081180811809,
+ "missing_tasks": []
+ },
+ "overall_score": 0.12057604157917208
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.07893017100109866
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.12579260798514427
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.15897902615904647
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.1275512898313342
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.1724799353848912
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.1166739111764397
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.09276606649010487
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.014803312629399587
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.2126465842330819
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.2774436090225564
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.03857183991826921
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.06561871098996794
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.171712228743858
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.14766910173600153
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.21050154891192577
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.020659062938075456
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.02100010342704044
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.15091196450213815
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.053829016986911726
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.10744987600153451
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.2975217887286715
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.13726004635095543
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.12057604157917215
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.03610947192711297
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.04525221984520586
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.10420386904761905
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.17708549842279478
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.19999999999999998
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.1804888391778344
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.14759816564804443
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.07952603609985566
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Idefics3/task_results.json b/static/eval_results/SI/Idefics3/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..866de33ec0c590e428ff19cdab156621b8786256
--- /dev/null
+++ b/static/eval_results/SI/Idefics3/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.11904761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.4833333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.27647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.10101010101010101,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.004464285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.0836931120056328,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.02777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.07375100780769979,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.5625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.025974025974025976,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.0659285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.40599999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.7581428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.030612244897959183,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.04131812587615091,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.008403361344537815,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.14842105263157898,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.008928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.07094736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.09411764705882353,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6034482758620691,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.41428571428571426,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.3448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.5689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.08888888888888889,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.36896551724137927,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5655172413793104,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5896551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.27586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.26,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.4846153846153847,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.37142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.122,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.435483870967742,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4758620689655173,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.5842105263157894,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.42857142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.6428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.08571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.20714285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8315789473684213,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8650000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.68,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.2799999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6399999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/InternVL2_2B/summary_results.json b/static/eval_results/SI/InternVL2_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b021be0db3f55396a6b2deb794e0c5aeea0bf1a8
--- /dev/null
+++ b/static/eval_results/SI/InternVL2_2B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.12069001041308772,
+ "micro_mean_score": 0.11842605219090299,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.28522459992910454,
+ "micro_mean_score": 0.28886838868388687,
+ "missing_tasks": []
+ },
+ "overall_score": 0.14262795568189
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.12376971454228163
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.13333012698269087
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.23055380602943532
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.1336101595652968
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.1905261989833371
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.08201891993308255
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.11985812372011641
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.013664596273291925
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.3035836752792625
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.4728533834586467
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.049217594376760605
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.09447908809124074
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.22923075081716637
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.15159509081542988
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.18693717087010792
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.07184873949579831
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.042960275283590164
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.12369372450210245
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.11544832152620972
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.12291071957107838
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.24746476545671045
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.3044601862783681
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.14262795568189013
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.03678177133215256
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.039950835968771276
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.09082268323996265
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.19769593666817548
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3428571428571428
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.15289272275533383
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.20753533217719797
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.12084183290294437
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/InternVL2_2B/task_results.json b/static/eval_results/SI/InternVL2_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e8cd4636185c55541c18106507a48e601a70573
--- /dev/null
+++ b/static/eval_results/SI/InternVL2_2B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.35,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7511904761904761,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.07843137254901962,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.020202020202020204,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.1616282025296959,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.023809523809523808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.22943157894736843,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.04795530132517215,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.025015859584362813,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7368421052631579,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3781550944997167,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.032467532467532464,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.20764285714285705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.3585714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.4424142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.09285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.0008403361344537821,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.03968253968253969,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.004201680672268907,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.08888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.4773684210526316,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.05263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.513578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.15294117647058825,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.04591836734693877,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.18534580498866213,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.2275862068965517,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.1827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.031034482758620686,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.37241379310344824,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3444444444444445,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.196551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.296551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.27241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.1827586206896551,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.22666666666666663,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.39999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.1586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.2571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.24285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5423076923076923,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5642857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.054000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.3714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4741935483870969,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.2413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7157894736842104,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.08571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.05714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8421052631578949,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7100000000000003,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.5700000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.27,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.5750000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/InternVL2_76B/summary_results.json b/static/eval_results/SI/InternVL2_76B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6aa5a3d95b342f82d82ffccd2bcf0a2a4db5aeab
--- /dev/null
+++ b/static/eval_results/SI/InternVL2_76B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3998616568018755,
+ "micro_mean_score": 0.39149064302628933,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.554748737158244,
+ "micro_mean_score": 0.5800738007380073,
+ "missing_tasks": []
+ },
+ "overall_score": 0.42051326751605805
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.4672429826553732
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.4230856844269695
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.570666577587141
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.3413715846680563
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.470239452171767
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.37110860027855824
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.3276283897777921
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.10153556963007855
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.5873606708191794
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7041804511278196
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.39401514252711556
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.3333759749379774
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5065289649268628
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.39253566766026804
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.509332186292545
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.30169355252977215
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.4030580663588658
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3863929410693585
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.4041893680050902
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.34950523809271744
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.48322911874283003
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5389260571078752
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4205132675160581
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.4585598678029664
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.1619462380866451
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.42624956232493
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.5361401478255164
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.5301587301587302
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.38874625564304305
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.47251288369387245
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.3075073077960568
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/InternVL2_76B/task_results.json b/static/eval_results/SI/InternVL2_76B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8028662a4e4e2be537ea34d75d0f5b930e9cc88
--- /dev/null
+++ b/static/eval_results/SI/InternVL2_76B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.48333333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.07099999999999997,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.68,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.474485256080059,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.03195853363097289,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8961038961038961,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6839999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.41731304954954773,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8949579831932775,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5306842105263159,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7460317460317459,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.7437368421052631,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.748642857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.45535714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.892857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9443785714285712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5548571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.6581526315789472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.5535714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.7380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.5982142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.2777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8095238095238094,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.16326530612244897,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3511830960216528,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.35,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.20408163265306123,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.20202020202020202,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.27450980392156865,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5316511584711222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.3136904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.017952657306398452,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.7058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.17391304347826086,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.1285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6117647058823531,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.2584183673469388,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.21468253968253967,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.43333333333333335,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.21999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.9071428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6499999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7620689655172415,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7827586206896551,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.489655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.4620689655172413,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5677419354838709,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7103448275862067,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.4428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.7400000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6038461538461539,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.35000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.706896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8421052631578949,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6034482758620691,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.22857142857142862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.2357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.39999999999999997,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.42857142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.67,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.875,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7200000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.6800000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/InternVL2_8B/summary_results.json b/static/eval_results/SI/InternVL2_8B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6626dca0ba52f31da57208eedcad8531070e052c
--- /dev/null
+++ b/static/eval_results/SI/InternVL2_8B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.27650612401825575,
+ "micro_mean_score": 0.27119471729837735,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.39388373890935635,
+ "micro_mean_score": 0.4045510455104551,
+ "missing_tasks": []
+ },
+ "overall_score": 0.29215647267040246
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.30220279568886643
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.2915702951202482
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.41603267498315427
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.24983605813271914
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.3284779417766259
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.27396131593770284
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.21701915158341967
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.0592961015994038
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.4403771552269444
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6521729323308272
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.22539102164423624
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.21516421271234623
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4088467630174509
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.27187498646061353
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.34383350461121587
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.0503849634147267
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.27991889529924496
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.25281668404704594
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.2452385560845516
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.26248166960198344
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3417106670258814
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.4334863789409244
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.29215647267040246
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.25646898023629483
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.09134825639389237
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.34736300770308126
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.3784296942438538
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3253968253968254
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.2912783917684807
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.34366199611891174
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.23531351908862871
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/InternVL2_8B/task_results.json b/static/eval_results/SI/InternVL2_8B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d85f55b2fb96e1c3aac7ec219d21c83831ccb8c
--- /dev/null
+++ b/static/eval_results/SI/InternVL2_8B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.32653061224489793,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3705584548643767,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.029341355400881983,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.46961677159776144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.15333333333333338,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6207965121029526,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.31111111111111106,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7521008403361344,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5010526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.46825396825396826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.4594839638482036,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.3337894736842106,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6858571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.41428571428571426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.4325396825396826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9280335714285712,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.47985714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.011813648192052783,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.5400999999999998,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.1142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8761904761904761,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.20987470834518032,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.061224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5238095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1717171717171717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.16176470588235295,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.39826242128287875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.008928571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.00602410597959912,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.13043478260869565,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6190476190476192,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.32941176470588246,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.16096938775510203,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.20918367346938774,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.152,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.08571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8071428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.4620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.39655172413793105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.5413793103448274,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.3586206896551723,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.22068965517241376,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.14827586206896554,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4103448275862069,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.6064516129032257,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.4551724137931034,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.41333333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.676923076923077,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.5111111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.37241379310344824,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.805263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.2379310344827586,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.31428571428571433,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.3214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.17857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.4428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.3785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.66,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.86,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6550000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8631578947368422,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.5650000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Llama_3_2_11B/summary_results.json b/static/eval_results/SI/Llama_3_2_11B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f38c1b0b9723c7ad1eb5965264683efa3b48f73
--- /dev/null
+++ b/static/eval_results/SI/Llama_3_2_11B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.20789144960796493,
+ "micro_mean_score": 0.20163641703273802,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3861125858565788,
+ "micro_mean_score": 0.4130381303813038,
+ "missing_tasks": []
+ },
+ "overall_score": 0.2316542677744468
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.20716804318138016
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.2546845731733449
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.37246318118748967
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.14653430680774066
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.25994005315432245
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.21893599730050764
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.15806381880426276
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.058403715092363084
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.39649168256429074
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.5728796992481204
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.18967604731477847
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.13775107230512224
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.3387317156024255
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.1970899659349296
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.3275861238187186
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.14822411270107955
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.24509462859331077
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.15123880546660726
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.16571305203663964
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.16301171403498463
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.34463240030392384
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.3762691853600945
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.23165426777444673
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.23423754995839735
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.09595984705908096
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.14131944444444444
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.2740778723883188
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.22857142857142856
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.18716549835825297
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.33493936008655223
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.12719796356144183
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Llama_3_2_11B/task_results.json b/static/eval_results/SI/Llama_3_2_11B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..458370c2887069e76fc00bfb6a713fd2ce451d4b
--- /dev/null
+++ b/static/eval_results/SI/Llama_3_2_11B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.4387755102040816,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.58,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.08577420938532272,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.010733939687873404,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.11976895147024216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8376623376623374,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.10497817448547724,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.0933333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.2431787841758548,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8151260504201682,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.009473684210526306,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.4170800923924453,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.02831578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.05935714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.05263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.53125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.19642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.5912698412698412,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.6875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.671125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.23564285714285718,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0808793106923892,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.3097052631578948,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.49107142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.4761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.38392857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.28888888888888886,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.2285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.27142857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.2285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.08027210884353742,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.2874256645177423,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.6904761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.0452954818941379,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.10101010101010101,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.25735294117647056,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.005791505791505792,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5361188092942067,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.002976190476190476,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0013736263736263737,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.2894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.13043478260869565,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7142857142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6705882352941177,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.16071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.09835600907029478,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.33571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5758620689655174,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.4428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6275862068965519,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.5586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.3482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4724137931034484,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.38064516129032255,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.20344827586206896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.4666666666666667,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.4115384615384616,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.23333333333333328,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.4642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.3551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7263157894736842,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.410344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.1142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.19999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.07857142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.74,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.18421052631578955,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8473684210526319,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.8200000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/MiniCPM_v2.6/summary_results.json b/static/eval_results/SI/MiniCPM_v2.6/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1e91edbcb5931b29cb88fcb6d0990c607e10cb5f
--- /dev/null
+++ b/static/eval_results/SI/MiniCPM_v2.6/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.23230765810722817,
+ "micro_mean_score": 0.22684118052665975,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4360655066213874,
+ "micro_mean_score": 0.4588560885608856,
+ "missing_tasks": []
+ },
+ "overall_score": 0.2594753712424494
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.26814713591233313
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.2657183000752527
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.3977302205205499
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.18352505380246076
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.3045977370408878
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.2244713686485571
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.17375496033997198
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.06087615859328559
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.45156722842924535
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.660718045112782
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.21066692306852683
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.17128318830807052
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.3681846956052881
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.23021362338817897
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.34994481629202306
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.012567281814686655
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.20284349423013687
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.23679437883858215
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.21540007432647457
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.2036075191422558
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3711731498662282
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.39586776859504136
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.25947537124244935
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.21340553041678637
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.07517089101065139
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.20497125933706817
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.3620762837308124
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3507936507936508
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.25260048981169975
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.33417132133610217
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.14556723677922526
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/MiniCPM_v2.6/task_results.json b/static/eval_results/SI/MiniCPM_v2.6/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4fd3adbb982b548e0ba82627ef2680b85d85542a
--- /dev/null
+++ b/static/eval_results/SI/MiniCPM_v2.6/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.30216436328431096,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.031192609502066617,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.08571428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1414141414141414,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.17142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.03214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.03365154769951505,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.1786994367639529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.23684210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.26673684210526316,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.43479972951988266,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.38888888888888884,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.5208819022936791,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.09114179383067332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3512276296925473,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7467532467532468,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.3992857142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.16192857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8309857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.18367346938775508,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.26,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.02513456362937331,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.49999999999999994,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.6890756302521008,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.28888888888888886,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.4542105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.17992339728368958,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.24133333333333346,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5079713114698053,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.0750526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6904761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.7058823529411766,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.2202380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.09450113378684807,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.12857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.18571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.5896551724137933,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.4689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7785714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.22142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7103448275862069,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.41111111111111115,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.5793103448275863,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5586206896551725,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5206896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.393103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5400000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.49285714285714277,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.1928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6923076923076923,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.09,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.46428571428571425,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5322580645161291,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.5827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.768421052631579,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.3928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.20714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.2571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.42142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8578947368421055,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.555,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7100000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.48999999999999994,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7349999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Molmo_72B/summary_results.json b/static/eval_results/SI/Molmo_72B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..67df8a5ceb069e1926824e120e649ddeba93073b
--- /dev/null
+++ b/static/eval_results/SI/Molmo_72B/summary_results.json
@@ -0,0 +1,223 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 270,
+ "num_eval_samples": 4073,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.36480000609384927,
+ "micro_mean_score": 0.36205779758110807,
+ "missing_tasks": [
+ "planning_screenshot_termes",
+ "table_understanding",
+ "MMSoc_Misinformation_PolitiFact"
+ ]
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4465682063915481,
+ "micro_mean_score": 0.4850553505535054,
+ "missing_tasks": []
+ },
+ "overall_score": 0.3758072638262318
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1672,
+ "tasks": [],
+ "average_score": 0.41462128751753047
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.4223762317042425
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.5756984198310193
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1615,
+ "tasks": [],
+ "average_score": 0.2983397150768741
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1698,
+ "tasks": [],
+ "average_score": 0.4110431137367615
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.3070615049173117
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1257,
+ "tasks": [],
+ "average_score": 0.29197652844726363
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 340,
+ "tasks": [],
+ "average_score": 0.07825953913967484
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 639,
+ "tasks": [],
+ "average_score": 0.445412976139552
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 155,
+ "tasks": [],
+ "average_score": 0.5953120300751881
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1107,
+ "tasks": [],
+ "average_score": 0.3754692399771127
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.24743187516175363
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1300,
+ "tasks": [],
+ "average_score": 0.47052754190598134
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1509,
+ "tasks": [],
+ "average_score": 0.35176591065677537
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.4405271103381682
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.18757766329699532
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1105,
+ "tasks": [],
+ "average_score": 0.3670439889863054
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3048441329189725
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.36443166533642163
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 679,
+ "tasks": [],
+ "average_score": 0.3342330361070466
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.4120820025247545
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 552,
+ "tasks": [],
+ "average_score": 0.5421225239407056
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5182,
+ "tasks": [],
+ "average_score": 0.3757024328002091
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.5042591723808818
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.13015529062282669
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.2582151610644257
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1306,
+ "tasks": [],
+ "average_score": 0.4704848349431393
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.6714285714285714
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.3557374102316002
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1279,
+ "tasks": [],
+ "average_score": 0.39648868632862583
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.2954490282663994
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Molmo_72B/task_results.json b/static/eval_results/SI/Molmo_72B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5afd5f91add75e7356970c4840d0ccf634eb7ede
--- /dev/null
+++ b/static/eval_results/SI/Molmo_72B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.09950000000000002,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.64,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3362778655358422,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.1608696123082764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.8421052631578947,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5787232350283793,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8246753246753247,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.2665355246878068,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.31399999999999995,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.31111111111111106,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6192105263157894,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7539682539682541,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.4786650057305392,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.530263157894737,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6355000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.6468253968253969,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.7636999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5474285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.21567342533242703,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.7032473684210526,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6339285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.47619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.17142857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5102040816326531,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.5583866383665127,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.24285714285714288,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.020673971469919374,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8869047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.1836734693877551,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.24242424242424243,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.39558823529411763,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.32955157882083963,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.24404761904761907,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.030696230874143155,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.21739130434782608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.9047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.2074829931972789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.3755385487528345,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.5294117647058824,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.23214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.09073936194509774,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.18571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 0
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 0
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 0
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7400000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.54,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.7000000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.905263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.5149999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.05,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6285714285714284,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.6862068965517241,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6862068965517241,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7071428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.25862068965517243,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.629032258064516,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.42758620689655175,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.6533333333333334,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7461538461538463,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3111111111111111,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.6275862068965516,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8263157894736842,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.08571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Molmo_7B_D/summary_results.json b/static/eval_results/SI/Molmo_7B_D/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a75e051b5c4fd6a1eb5c021faa7ba3bfd17d761d
--- /dev/null
+++ b/static/eval_results/SI/Molmo_7B_D/summary_results.json
@@ -0,0 +1,221 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 272,
+ "num_eval_samples": 4102,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.2098088446992518,
+ "micro_mean_score": 0.20550929661464645,
+ "missing_tasks": [
+ "MMSoc_Misinformation_PolitiFact"
+ ]
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.35697926179118733,
+ "micro_mean_score": 0.38936039360393604,
+ "missing_tasks": []
+ },
+ "overall_score": 0.22949405972428777
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.2239160707791646
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.24958564675030656
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.35830528296805764
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.17259103199957743
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1698,
+ "tasks": [],
+ "average_score": 0.25685172601108597
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.19244928377978027
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.1939605648275455
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.035588894400059294
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 639,
+ "tasks": [],
+ "average_score": 0.32356781790614975
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 155,
+ "tasks": [],
+ "average_score": 0.4433947368421053
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.18796442406686825
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.1327652104313917
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1300,
+ "tasks": [],
+ "average_score": 0.323282724310645
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.21356852314768052
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.3106093738160722
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.09043432702433757
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.21610753722787088
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.17305260714177756
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.17907829453546903
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.22086240998395923
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.324079404512755
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 552,
+ "tasks": [],
+ "average_score": 0.3169618260527351
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5213,
+ "tasks": [],
+ "average_score": 0.22943156697817663
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.27184002856754413
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.06424366688759706
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.1158110119047619
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.30311570603428784
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3619047619047619
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.173722800705029
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1279,
+ "tasks": [],
+ "average_score": 0.2787344822161389
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.1740048655548875
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Molmo_7B_D/task_results.json b/static/eval_results/SI/Molmo_7B_D/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0bbe3028db39138ab0ace362dae451ffeb4d4922
--- /dev/null
+++ b/static/eval_results/SI/Molmo_7B_D/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.3877551020408164,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.313862400588624,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.038011511191532274,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.41069776234840943,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.7662337662337662,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.02719542601438354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.48066666666666674,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6552623858210546,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.08888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8277310924369747,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5805263157894738,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.43650793650793646,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.26186238411043455,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.38000000000000006,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.47507142857142853,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.33571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.21875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.5634920634920635,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.6044642857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.42121428571428565,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.060991464591085905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.3600894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.34523809523809523,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.2767857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4888888888888889,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.2571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.24285714285714288,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.01677018633540373,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.12244897959183673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.015030224651396001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.16666666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.10101010101010101,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.13725490196078433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.10903008480749887,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.014172871487032616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.09649122807017543,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.21904761904761907,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.08571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5411764705882353,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.15858843537414966,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1503684807256236,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 0
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.43571428571428567,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.3586206896551723,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.45714285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5896551724137932,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.4714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6517241379310346,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.703448275862069,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.35862068965517246,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.19999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4379310344827586,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5967741935483872,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.23103448275862065,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6730769230769231,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.05555555555555555,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.31379310344827577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7789473684210527,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7150000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.71,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6849999999999998,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.255,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.6428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/NVLM/summary_results.json b/static/eval_results/SI/NVLM/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..45c4dacafa84da381f3a9b804029c98426e57384
--- /dev/null
+++ b/static/eval_results/SI/NVLM/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.32989872890926025,
+ "micro_mean_score": 0.32315683713111915,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4469349818134809,
+ "micro_mean_score": 0.4881303813038132,
+ "missing_tasks": []
+ },
+ "overall_score": 0.34550356262982296
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.3943476764428869
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.3359094293956291
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.46386896656934745
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.30043411704099793
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.38986101015677044
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.3152573721587561
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.26907670581189963
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.07615011020932495
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.45915496990325566
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6521954887218044
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.2814148428882822
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.30070480033875985
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4332099707344069
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.32094439294995036
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.4387718807206103
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.09447890526012262
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.34135355449546323
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3215154320779893
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.29287492253780084
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.28793758479482745
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3828322321439372
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5016004197822379
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.345503562629823
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.352859881186271
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.1252138046141793
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.37153871965452856
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.45079588183469294
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3047619047619048
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.3518857602487131
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.37572531212341936
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.2786818799518423
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/NVLM/task_results.json b/static/eval_results/SI/NVLM/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8e5b4debdef91683ae1d1543ab9361ef5d55cb9
--- /dev/null
+++ b/static/eval_results/SI/NVLM/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.21666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5408163265306122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.54,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3505208453866322,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.046100667663102404,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.4733815589511575,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8896103896103894,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.05655934095486302,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5693333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.3056056231994156,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.3777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.8823529411764708,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5594736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.40264995876467374,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.5572631578947368,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.3831428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.5803571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.8412698412698412,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8834571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4170714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.15524366362546105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.39636315789473686,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.44642857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.35555555555555557,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.11428571428571428,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.2914684343291797,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.01873586319222244,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8761904761904761,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.24489795918367346,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.1238698998933831,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.12244897959183673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.20202020202020202,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.2901960784313726,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.35170464659390166,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.1488095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.022876924032503295,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.23684210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.738095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.13392857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.05952380952380953,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.21,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.37142857142857144,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5428571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7310344827586209,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7655172413793104,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7172413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.7655172413793103,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.4896551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4709677419354839,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.6379310344827587,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.6266666666666668,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.726923076923077,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.4444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.25,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.710344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7947368421052632,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4827586206896553,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.1142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.32857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.15714285714285717,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.05,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.20000000000000004,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.725,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.11052631578947371,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.535,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6649999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.9105263157894737,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.3499999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/POINTS_15_7B/summary_results.json b/static/eval_results/SI/POINTS_15_7B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..644eeeb1861af7a618903bbbf3e87ee094ef44c8
--- /dev/null
+++ b/static/eval_results/SI/POINTS_15_7B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.31355970638319003,
+ "micro_mean_score": 0.30728203432446294,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.41331219301389166,
+ "micro_mean_score": 0.42749077490774917,
+ "missing_tasks": []
+ },
+ "overall_score": 0.32686003793395024
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.3443899066327916
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.3333459246264911
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.43105364189963935
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.28961632718794406
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.35317851821169477
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.30711751050032277
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.26796963300870397
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.08369131166291023
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.45980379926019677
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6173496240601504
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.27713077639707523
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.24722440389191766
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4276343385855984
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.30991539183635347
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.37330010041194067
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.15572486552610215
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.3069044183161335
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3101162129247054
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.2614010338203017
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.28761899055063767
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.37619796536407
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.4855568673750491
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.32686003793394974
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.3095789895735217
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.1277481304284383
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.31641062675070025
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.4420532221275683
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.48095238095238096
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.32551503611448934
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.35705988992418164
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.24128406446063128
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/POINTS_15_7B/task_results.json b/static/eval_results/SI/POINTS_15_7B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b4734ccbd99ebb7e2040b4f938d34fb0b03f975
--- /dev/null
+++ b/static/eval_results/SI/POINTS_15_7B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5468542942842844,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.013803101918097808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.3125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.06190476190476191,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.6488095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3696078431372549,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.24242424242424243,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.8142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.047619047619047616,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0523380101397914,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.2303962026767666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.8338,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6204559427628572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.03178571428571437,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.01108560858915326,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.23277542468896867,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8116883116883117,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6251428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.41471428571428565,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8822571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.45000000000000007,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.58,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6339285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.16859258819506148,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.6904761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9705882352941178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.59375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.08888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.04314371819298669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.5803571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.5253333333333335,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.5571578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.03333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6705882352941177,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.05,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.22023809523809526,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.23013038548752834,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.5310344827586206,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.557142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.4241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.2586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7500000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6034482758620691,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.47777777777777786,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.45517241379310336,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.4620689655172413,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.4344827586206897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.34137931034482766,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.28,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.3551724137931033,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6846153846153845,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.24200000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5516129032258066,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.4517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.736842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6285714285714284,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.2357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.3642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.31428571428571433,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.3928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.25000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.11052631578947371,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6849999999999998,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.5199999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.3,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6549999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/POINTS_7B/summary_results.json b/static/eval_results/SI/POINTS_7B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8db3edd07587d8cd344d3781063797b53af6eae2
--- /dev/null
+++ b/static/eval_results/SI/POINTS_7B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.25511317681632334,
+ "micro_mean_score": 0.24927711632415062,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.30315625179016,
+ "micro_mean_score": 0.3313653136531366,
+ "missing_tasks": []
+ },
+ "overall_score": 0.26151892014616823
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.2684488868499041
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.27890902837062037
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.37373928195086786
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.22387504020162652
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.2799740367463896
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.21311917080615544
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.21857370538972226
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.06502747891786666
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.36827291874151846
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.48204135338345855
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.18689735511962233
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.19332242733156837
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.3523684400745285
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.25684059763242745
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.33916980654110634
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.1499797713556708
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.2320749867881998
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.23004840221723208
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.239982641771955
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.23646374895042882
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.28263350209672056
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.4200183654729108
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2615189201461682
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.22503259387671015
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.08476480516686397
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.3151282387955181
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.3737263982731003
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.34761904761904755
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.2606187882141402
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.27361452525243724
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.19633555542091463
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/POINTS_7B/task_results.json b/static/eval_results/SI/POINTS_7B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8da2e4943c6b9f525120acfd8b837a93cc11194
--- /dev/null
+++ b/static/eval_results/SI/POINTS_7B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.061224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6019887092978411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.3157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.024872434514772418,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09455782312925169,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3058823529411765,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1717171717171717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.08695652173913043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.016800837996620143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.43135590531192486,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.07894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.21949473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.6984126984126984,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.12450104213939192,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.18628758352725225,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.27046408475418426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.525974025974026,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.3725,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4362142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.6044285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.3469387755102041,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.08333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.16,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.08567382842562732,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.04761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.3529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.15625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7157894736842105,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.04747631990100836,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.20535714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.17799999999999994,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.37305263157894736,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.03333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6904761904761905,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6117647058823532,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.17091836734693877,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1685090702947846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.46206896551724125,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.1642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.34827586206896544,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.22413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.4103448275862068,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.34444444444444444,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.23793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.44482758620689644,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.3793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.4241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.29999999999999993,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.26551724137931026,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.4857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6769230769230771,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.3142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.13199999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.48387096774193544,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.3206896551724137,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7105263157894736,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.45,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.37857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.1,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.836842105263158,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6900000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6099999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.37,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.625,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Phi-3.5-vision/summary_results.json b/static/eval_results/SI/Phi-3.5-vision/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7443fff17692c8d16a8171ae077403abf95772d
--- /dev/null
+++ b/static/eval_results/SI/Phi-3.5-vision/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.2561274958722834,
+ "micro_mean_score": 0.2504214576875906,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4272267419054576,
+ "micro_mean_score": 0.445879458794588,
+ "missing_tasks": []
+ },
+ "overall_score": 0.2789407286767066
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.2682909697086125
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.2845968124529633
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.4299430434813172
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.22905610983444738
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.3097558922032538
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.26422404318271525
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.21524515429041854
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.08173397535709728
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.44526176399160444
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6958045112781954
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.18482544209393917
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.1852656532829957
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4073649042468842
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.2797292831010349
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.3249099963089235
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.04423070234557053
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.2567782320477167
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.2141318618135909
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.23002523914604356
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.20335546763980886
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.38510487366381607
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.46076785167694245
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2789407286767065
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.18412184931451608
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.09254779551496593
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.3150531045751634
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.38360617573843164
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.4142857142857143
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.3034971430938622
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.3374902354661273
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.19473774010136682
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Phi-3.5-vision/task_results.json b/static/eval_results/SI/Phi-3.5-vision/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3aff65bd67780f32d3dfcac0b21af7487ca36c4
--- /dev/null
+++ b/static/eval_results/SI/Phi-3.5-vision/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.12462000961394672,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.11904761904761904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.029751219517657808,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.018367346938775512,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.6488095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.15686274509803924,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.12121212121212122,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.043478260869565216,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.04535618459802735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.4628842105263158,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5246016576544484,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.6269841269841271,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.7891263675852077,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.03700000000000008,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.032668828291194586,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.35448451238579437,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.5259740259740259,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7468571428571432,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5454285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.16666666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9523071428571426,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.28571428571428564,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.13571428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.020833333333333332,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.52,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.017032833262569633,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.46825396825396826,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.6260504201680673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.40625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.24444444444444446,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.6657894736842106,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21324372091628846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.44444444444444436,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.2767857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.22333333333333344,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.3399232403523023,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.13326315789473686,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.4285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.16071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.15972222222222224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6724137931034485,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.3785714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.4517241379310346,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.7214285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.34285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.3555555555555555,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.39655172413793116,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5275862068965518,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7310344827586208,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.4448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.26666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.2448275862068965,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5000000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6115384615384616,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.08,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.5857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.4064516129032259,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.42758620689655175,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7368421052631579,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.47857142857142854,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.19999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.10714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.38571428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.21428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8894736842105264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.89,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "iq_test",
+ "score": 0.24137931034482757,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.48620689655172417,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.46428571428571436,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6749999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.915,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.8,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Pixtral_12B/summary_results.json b/static/eval_results/SI/Pixtral_12B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf0689e6dbc300e655b0ab20bbcac39388d1c437
--- /dev/null
+++ b/static/eval_results/SI/Pixtral_12B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3436942439614412,
+ "micro_mean_score": 0.3373564384613738,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4417271955536318,
+ "micro_mean_score": 0.4845633456334564,
+ "missing_tasks": []
+ },
+ "overall_score": 0.3567653041737333
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.39551360119171197
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.37359181974124417
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.4677006268371793
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.3055711926752603
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.38842270268832113
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.35085932465399283
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.28269833721806076
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.08507904212012304
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.4193828210432134
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6302142857142857
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.31669784888602887
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.2688429906381188
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4327891810625066
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.36461586731895695
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.3947713702430871
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.11048396896880823
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.36511340930610364
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3161209026018942
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.29510067482559116
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.3135393276021012
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3995518703501119
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5076172985263894
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.3567653041737331
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.4143415072482432
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.122839038193565
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.3689221521942111
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.46377210154054166
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3444444444444444
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.35876745089800455
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.37374171749764634
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.27839183583970506
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Pixtral_12B/task_results.json b/static/eval_results/SI/Pixtral_12B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6db51d07c023f8f5a8a4b56fcad623fa9902f0e
--- /dev/null
+++ b/static/eval_results/SI/Pixtral_12B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.21666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.01728571428571422,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.74,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.3819734783418875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.006682223651902177,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.9473684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5101908757577766,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8961038961038961,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.22936526304647478,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.636,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.7556321946743035,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4888888888888888,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9369747899159664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7302105263157892,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.7460317460317462,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5540115257336367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.41168421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.6427142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.48214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.7896825396825398,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9698285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.5464285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.15356401612245238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.48218947368421045,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6160714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.5357142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.28571428571428575,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.20000000000000004,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.2703724018737771,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.16326530612244897,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.3052125197255065,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.5789473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.23232323232323232,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.3980392156862745,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.48256296698343887,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.23214285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.3095238095238095,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.018168314191328267,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.23684210526315788,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.17142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.08571428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.2057823129251701,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.2205215419501134,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.22200000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.32857142857142846,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8500000000000002,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.2785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.7379310344827585,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.5827586206896551,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.29655172413793096,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.40714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.4111111111111112,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7379310344827585,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.6896551724137933,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.4586206896551724,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.3933333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.47857142857142854,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.4827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7461538461538463,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.46451612903225814,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.6448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8105263157894738,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.35714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.09285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.10714285714285718,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.2571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.09285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.72,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.69,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.54,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7649999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Qwen2_VL_2B/summary_results.json b/static/eval_results/SI/Qwen2_VL_2B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b971e81cae22201b809b20a03940d5a8fa91adb
--- /dev/null
+++ b/static/eval_results/SI/Qwen2_VL_2B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.22787906973244856,
+ "micro_mean_score": 0.2234748515064842,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3509364634962041,
+ "micro_mean_score": 0.3768757687576875,
+ "missing_tasks": []
+ },
+ "overall_score": 0.24428672223428263
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.2253353309586889
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.25965511679594977
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.3778480095314066
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.19211647307230917
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.27091980735233906
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.21906286524745977
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.19305913502727232
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.07432337143230854
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.37769658880841467
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.5887067669172933
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.1930642044577058
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.13312812081322709
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.36205043973893236
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.23259922343062173
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.2842921728720087
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.09293971931071163
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.23210528191388644
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.1652854805017628
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.17061075451792151
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.23904036592289388
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3296071840681468
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.42328479601206864
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.24428672223428244
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.21948696002702636
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.08656714113327156
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.18075323879551822
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.33781829803679747
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.36984126984126986
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.2448949597527861
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.28841305815072016
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.16147424237969243
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Qwen2_VL_2B/task_results.json b/static/eval_results/SI/Qwen2_VL_2B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e11e8fd8508449a25de25e2575f1b61f16033db5
--- /dev/null
+++ b/static/eval_results/SI/Qwen2_VL_2B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "location_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.02040816326530612,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5018438475034174,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.03287939197004419,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.10884353741496598,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.22794117647058823,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.1414141414141414,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.21428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.10416666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.07779907911988974,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.044674076957756666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4499764488395411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.12280701754385963,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.16666666666666669,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.36160000000000003,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5182194949333015,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.3571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.058861588605067065,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.36607142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.18447073719814694,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.6623376623376623,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.12299999999999996,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.2631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.11666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.37485714285714283,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.38095238095238093,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.8766142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.23469387755102034,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.19285714285714284,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.66,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.5178571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.043022295764280405,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.5238095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.7058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.40625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.5589473684210526,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.1887112140198952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.17777777777777776,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.4017857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.46666666666666673,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "figureqa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.35158243862450156,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.513578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.4761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.7058823529411766,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.18579931972789115,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.1149092970521542,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.4068965517241379,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.39310344827586197,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.23448275862068968,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.4928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.12142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.5379310344827586,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.12222222222222223,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.3310344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5034482758620689,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.506896551724138,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.296551724137931,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5533333333333333,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.5206896551724136,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.0642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.29285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5692307692307693,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.2285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.07200000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.2285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5193548387096775,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.32068965517241377,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7052631578947368,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.2857142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.17142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.11428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.05714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.23571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.04285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.10714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.2571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.26842105263157895,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8842105263157897,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.7050000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.62,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.31499999999999995,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6250000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Qwen2_VL_72B/summary_results.json b/static/eval_results/SI/Qwen2_VL_72B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ede9d54993b54c73ddf7fd14fa46ff74244d04e5
--- /dev/null
+++ b/static/eval_results/SI/Qwen2_VL_72B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4730536307784527,
+ "micro_mean_score": 0.4659830915476831,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.5510079982505317,
+ "micro_mean_score": 0.5826568265682657,
+ "missing_tasks": []
+ },
+ "overall_score": 0.48344754644139654
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.5688395686544739
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.49559260360544427
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.6040487985710314
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.40095954140813556
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.5387802130987105
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.43580017776139807
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.367367170491919
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.1474368760019346
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.5782670824874114
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.7294097744360902
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.5070634902661117
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.4333175250859433
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.5308367876160253
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.4473618716373871
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.5251544991587351
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.18309869697155778
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.49191356756271953
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.45605294241715827
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.4608929319719144
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.44066773476234555
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.4974532098882374
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5851458306003763
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4834475464413966
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.6323628750211533
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.23323065689783842
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.48352372198879545
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.6141225191470527
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.39365079365079364
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.41914085094672937
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.4874613649312476
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.3355316008767396
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Qwen2_VL_72B/task_results.json b/static/eval_results/SI/Qwen2_VL_72B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..723efdfd25a66a6bcdb1251d77044c84327645ee
--- /dev/null
+++ b/static/eval_results/SI/Qwen2_VL_72B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.38333333333333336,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.35550000000000015,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5510204081632651,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.84,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.22916666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.4347808225785918,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.1519116796574013,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.7396930070845659,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.9415584415584416,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.25316516240287096,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6546666666666668,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6090597441949396,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.5535353535353535,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9747899159663866,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7573684210526316,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.761904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.7785773738753342,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.6320526315789474,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7822142857142859,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.8125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.7410714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.9404761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.9375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9441071428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.6255,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.273716885964573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.6151315789473685,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6339285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.7261904761904762,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.7321428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.27142857142857146,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.3142857142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.15714285714285717,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.1142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.469394874670551,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.021690583803327123,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.9,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.1836734693877551,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.4790877604978875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.20408163265306123,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.3939393939393939,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.4504901960784313,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.4897466817225849,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.4583333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.40476190476190477,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.026882674514698886,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.21052631578947367,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.2571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.21739130434782608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.9047619047619048,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.3642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.5529411764705883,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.46811224489795916,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.4113378684807256,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.7647058823529411,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.29200000000000004,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.3071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.6928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.75,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7413793103448277,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.6928571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.813793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.724137931034483,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.75,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.6241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.43793103448275855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7827586206896552,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.6516129032258065,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.7448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.4785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.44666666666666666,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7307692307692307,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.6888888888888888,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.7413793103448275,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7894736842105264,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.6241379310344829,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.19999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.2,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.16428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.45714285714285713,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.36428571428571427,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.507142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.1142857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.6357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.6949999999999998,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.11052631578947371,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6250000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6449999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.910526315789474,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.5650000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Qwen2_VL_7B/summary_results.json b/static/eval_results/SI/Qwen2_VL_7B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c602e711d4391693f5f1065275958be22caa3a30
--- /dev/null
+++ b/static/eval_results/SI/Qwen2_VL_7B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3538656561495699,
+ "micro_mean_score": 0.34581250459157137,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4517429592549692,
+ "micro_mean_score": 0.4730012300123002,
+ "missing_tasks": []
+ },
+ "overall_score": 0.3669159632302898
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.40533138482347386
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.3844930054666535
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.5151962864568788
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.28799562910106935
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.41129100495999377
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.31735419703044254
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.2780300019986884
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.08484497782236566
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.4927960336040459
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6569285714285714
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.3555822260000372
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.27651089530142536
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4722059967533397
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.3279988413468837
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.39575781162159634
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.167887099917599
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.38908261098680974
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3343930759222326
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.3068323820854221
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.31569247186288174
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.39180263622429157
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5064978792251521
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.3669159632302898
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.44618631789079277
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.13224920829749706
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.36572347689075624
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.47845712831657317
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.3507936507936508
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.28910547521894076
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.40527029084195965
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.25874500882297563
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Qwen2_VL_7B/task_results.json b/static/eval_results/SI/Qwen2_VL_7B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a50f6e2038e76e1c2ba23de9f9b41e73cd16852
--- /dev/null
+++ b/static/eval_results/SI/Qwen2_VL_7B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.08607142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.5306122448979592,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.64,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.269528925627328,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.12148848554948376,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.7309631423965609,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.8701298701298701,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.20139246340756983,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.4426666666666668,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.6222222222222221,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.9705882352941178,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.3684210526315789,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.7178947368421053,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.738095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.6557012145398754,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.4413684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.7007142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.5857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.78125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.6517857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.892857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.9662571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.3987142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.1507488668719037,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.5190473684210527,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.5089285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.08571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.09999999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.042857142857142864,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.06598639455782314,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.43082396072826695,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0801904200671749,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.863095238095238,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.27993884030889277,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.45,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.6842105263157895,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.25252525252525254,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.296078431372549,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.3027852300484236,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.06872294372294371,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.03355324641748354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.21428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.08695652173913043,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6705882352941176,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.28852040816326535,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.30960884353741497,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.8214285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.29411764705882354,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.254,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.742857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.65,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5172413793103449,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.6241379310344827,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.7172413793103447,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6137931034482759,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.7071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.44137931034482775,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.3551724137931035,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.48620689655172405,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.6225806451612903,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.5,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.10714285714285716,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.32,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.7038461538461539,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.4555555555555556,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.30714285714285705,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.48965517241379297,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7631578947368421,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.3413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.18571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.32142857142857145,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.1928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.39285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.05714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.1285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.4714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.2642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.75,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6700000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.615,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.9,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.29,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.42857142857142855,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/SmolVLM/summary_results.json b/static/eval_results/SI/SmolVLM/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..97be21070ed94838e45c9cd7983b884ba1236b63
--- /dev/null
+++ b/static/eval_results/SI/SmolVLM/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.07348385181460795,
+ "micro_mean_score": 0.0732694668402814,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.2427337975725658,
+ "micro_mean_score": 0.2504920049200492,
+ "missing_tasks": []
+ },
+ "overall_score": 0.09605051124900234
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.08610257462374318
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.10501451629704919
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.12403047579230878
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.0865768026006882
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.12889143083611815
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.077851045512787
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.061765081348496016
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.008178053830227744
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.1293055688222371
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.2222067669172932
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.029842216842698305
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.09044016512537822
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.1383108182448921
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.0979843882877799
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.143657576543239
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.10013149786398344
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.015386904208215372
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.12682789970863723
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.05128016118728194
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.09999979828107199
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.21315705831839693
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.10496742314924135
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.09605051124900241
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.03906165844850793
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.03272316763696074
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.05390625
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.1606753925138995
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.2222222222222222
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.13950042461525716
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.09639506190200878
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.06728619034079576
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/SmolVLM/task_results.json b/static/eval_results/SI/SmolVLM/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..df3d9af11d432c21a23c5e3b469fa763a71857e6
--- /dev/null
+++ b/static/eval_results/SI/SmolVLM/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.09090909090909091,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.47058823529411764,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.06919903502760565,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "counting",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.017857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.6785714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.04081632653061224,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "iconqa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.10526315789473684,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.057405852870824024,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.02,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.4638221428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "algebra",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.060476150610554294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.05555555555555555,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.29285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.038000000000000006,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5285714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.5684210526315789,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.09999999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.1793103448275862,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.11379310344827588,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.04137931034482759,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.28965517241379307,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.15714285714285717,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6461538461538462,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.5142857142857141,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.16,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5290322580645161,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "iq_test",
+ "score": 0.25172413793103443,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.1448275862068966,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.4357142857142858,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.09655172413793105,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.19310344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.14827586206896554,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.1482758620689655,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.3642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.08571428571428572,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.23571428571428577,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.19000000000000003,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.8250000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.14210526315789482,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.54,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.615,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.5842105263157894,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.05714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/all_model_keywords_stats.json b/static/eval_results/SI/all_model_keywords_stats.json
deleted file mode 100644
index 91397f9b11ec753969773236bd3e64dd1aee80e2..0000000000000000000000000000000000000000
--- a/static/eval_results/SI/all_model_keywords_stats.json
+++ /dev/null
@@ -1,5348 +0,0 @@
-{
- "Aquila_VL_2B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.23446107609710548
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.08500232938689507
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.2736043135287443
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.19099680045595863
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.39206349206349206
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.3004030430829456
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.08421801129956362
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.2897054521388083
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.10279080594456047
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.3078950207372175
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.2248398559924241
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.3533180891172854
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.11430966292465267
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.11601893140078427
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.2219754327969366
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.1772030496280578
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.1884228017877996
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.43875114784205704
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.23519563962981577
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.28092356180071465
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.13944236147744013
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.3826225373137124
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.20221672149607509
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6020225563909773
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.2521179990443663
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.19504930283108274
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.2374462987863378
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.0625675073438388
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.3521969849344277
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.18502360430789122
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.17480107496737848
- }
- }
- },
- "Aria": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.38003253384687213
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.33746818901184633
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.4097428531166082
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.22745674367681176
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.4142857142857143
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.4433718463877228
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.10860172719687727
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.3496496998103286
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.04960831797041802
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.40912566596786665
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.3300885226603808
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.45572004760273754
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.259572791833904
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.27807228404309764
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.3440023372395526
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3053148323646246
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.2579833154471113
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.4787572696663607
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.3082165471908181
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.45805038774421686
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.3227895527307711
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.5240018518464876
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.3401734439719901
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7129097744360902
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.40684369400912745
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.300830802045758
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.33433893000455434
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.07560632809892315
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.49083567506460973
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.22595636868728874
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.3653361644690575
- }
- }
- },
- "Claude_3.5": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.6124985410830999
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.6692574633083122
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.5401030980230185
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.4760293511799448
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.4174603174603175
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.6061759059165749
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.315623741632974
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.5134329832846579
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.34512576094802216
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.6014068374421209
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.5589506892621444
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5314705050989759
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.4753194125515341
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.54981020669637
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5373019912310933
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.5072889926389097
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.5112348724553849
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.6164633346451529
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.4712835541311676
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.5769294912151234
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.5556080592390198
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.6017116084931068
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.530309401925396
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7033233082706767
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.5757222503903228
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.5044379729567133
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.5499261524919171
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.19196633042767672
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.636886763741019
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.4511182385296208
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.6192941518442948
- }
- }
- },
- "Claude_3.5_new": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.565344887955182
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.6633000290867174
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.5737128945237007
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.4831956110227109
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.6285714285714286
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.6465631513465354
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.3511145464456188
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.5580232103280633
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.3619606028475468
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.5927094432064197
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.5899091882733952
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5838312144672865
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.4705509892153899
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.574168555556774
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5636254729390459
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.5249488326690246
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.5300876558354416
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.6380252743889108
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.5106873710119535
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.6409616762702612
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.5638133905687104
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.6433122980573076
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.5426169039575065
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6839924812030076
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.6234123112506059
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.5171075478248572
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.583387314927874
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.22440221381985706
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.6507240054983652
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.48795977188332873
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.6242355223474262
- }
- }
- },
- "GPT_4o": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.5785991479925302
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.7387886231372116
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.6073751328612617
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.4387500704123191
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.626984126984127
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.6418126331560571
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.302146719713088
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.5184702350129554
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.3427989299648589
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.6086090683867454
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.533172482404735
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.6107746700730057
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.4938444672052553
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.6093502418300007
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5672657028463585
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.5351259728352326
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.6016521462358102
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.6204512659058113
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.4632537848154335
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.6563556079088679
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.5370230887013343
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.6716375018861761
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.5506629280904943
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7342894736842105
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.6512174145248227
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.47164342831848766
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.5798789532163023
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.1970421212123289
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.6933181759121947
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.4267383416112408
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.6400436962819274
- }
- }
- },
- "GPT_4o_mini": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.4556095354808589
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.5484747566251307
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.535145025177205
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.33759329198549914
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.4873015873015873
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.5437015929631214
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.22983305008250185
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.39601047285667923
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.1194248916897328
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.5198662454862603
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.4194828137611333
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5569877095654321
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.3779902828155749
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.4645916955325127
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.46343334374251277
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.38282644938937405
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.42048902061937554
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5559184922821285
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.3777213713726476
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.5986898724975707
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.4761935495255144
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.5775026308600164
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.4555977624507237
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7960714285714285
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.5458509360302554
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.37680368570252215
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.48241878593503174
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.17294565844175996
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.5987052352447554
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.33277278942510824
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.503118803606002
- }
- }
- },
- "Gemini_1.5_flash_002": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.47487599206349207
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.45245079667466714
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.5086518140501541
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.3853815223607656
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.4380952380952381
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.5468998820129136
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.21148887498941377
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.48499051643275837
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.3348446026637953
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.5535202379362348
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.46724590271207767
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5613400178213946
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.33052002642818507
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.3722082840493195
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.45400479933257654
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3691249729531883
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.42013434507914493
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5905636451090996
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.43247267273235235
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.5470781816319514
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.43823554216399857
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.5955368143490581
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.4655431430975485
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7948947368421052
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.5122400421391089
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.4086167264646781
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.47630441828533016
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.09741974015331743
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.5920539115535787
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.3559690476405975
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.4474763430506795
- }
- }
- },
- "Gemini_1.5_pro_002": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.48587549603174607
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.504539358390968
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.5660366627264668
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.4200866579901879
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.48888888888888893
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.5964613809728712
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.28536490696494377
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.500158537824293
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.3592697030984118
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.6217290675275775
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.5132563067393096
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5888558035357285
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.4060403716629095
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.42724302639929596
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.5034399620483027
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.43754003302746525
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.4731762319443037
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.6245091608727974
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.48334866543174226
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.5644701189535662
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.4972242280053817
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.5995804836966744
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.5090111123207751
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7830639097744362
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.5647567827649111
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.448099634405986
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.5220033468415737
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.178032259819607
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.6342882147970302
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.3972807544005462
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.5000257619938475
- }
- }
- },
- "Idefics3": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.10420386904761905
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.03610947192711297
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.14759816564804443
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.07952603609985566
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.19999999999999998
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.17708549842279478
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.04525221984520586
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.1804888391778344
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.020659062938075456
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.21050154891192577
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.14766910173600153
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.171712228743858
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.06561871098996794
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.03857183991826921
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.12057604157917215
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.15091196450213815
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.053829016986911726
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.13726004635095543
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.10744987600153451
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.2975217887286715
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.02100010342704044
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.2126465842330819
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.1166739111764397
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.2774436090225564
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.1724799353848912
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.1275512898313342
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.12579260798514427
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.014803312629399587
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.15897902615904647
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.09276606649010487
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.07893017100109866
- }
- }
- },
- "InternVL2_2B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.09082268323996265
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.03678177133215256
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.20753533217719797
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.12084183290294437
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3428571428571428
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.19769593666817548
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.039950835968771276
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.15289272275533383
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.07184873949579831
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.18693717087010792
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.15159509081542988
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.22923075081716637
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.09447908809124074
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.049217594376760605
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.14262795568189013
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.12369372450210245
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.11544832152620972
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.3044601862783681
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.12291071957107838
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.24746476545671045
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.042960275283590164
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.3035836752792625
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.08201891993308255
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.4728533834586467
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.1905261989833371
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.1336101595652968
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.13333012698269087
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.013664596273291925
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.23055380602943532
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.11985812372011641
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.12376971454228163
- }
- }
- },
- "InternVL2_76B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.42624956232493
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.4585598678029664
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.47251288369387245
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.3075073077960568
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.5301587301587302
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.5361401478255164
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.1619462380866451
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.38874625564304305
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.30169355252977215
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.509332186292545
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.39253566766026804
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5065289649268628
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.3333759749379774
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.39401514252711556
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.4205132675160581
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3863929410693585
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.4041893680050902
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5389260571078752
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.34950523809271744
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.48322911874283003
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.4030580663588658
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.5873606708191794
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.37110860027855824
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7041804511278196
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.470239452171767
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.3413715846680563
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.4230856844269695
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.10153556963007855
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.570666577587141
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.3276283897777921
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.4672429826553732
- }
- }
- },
- "InternVL2_8B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.34736300770308126
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.25646898023629483
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.34366199611891174
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.23531351908862871
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3253968253968254
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.3784296942438538
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.09134825639389237
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.2912783917684807
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.0503849634147267
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.34383350461121587
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.27187498646061353
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.4088467630174509
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.21516421271234623
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.22539102164423624
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.29215647267040246
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.25281668404704594
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.2452385560845516
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.4334863789409244
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.26248166960198344
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3417106670258814
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.27991889529924496
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.4403771552269444
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.27396131593770284
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6521729323308272
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.3284779417766259
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.24983605813271914
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.2915702951202482
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.0592961015994038
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.41603267498315427
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.21701915158341967
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.30220279568886643
- }
- }
- },
- "Llama_3_2_11B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.14131944444444444
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.23423754995839735
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.33493936008655223
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.12719796356144183
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.22857142857142856
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.2740778723883188
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.09595984705908096
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.18716549835825297
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.14822411270107955
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.3275861238187186
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.1970899659349296
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.3387317156024255
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.13775107230512224
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.18967604731477847
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.23165426777444673
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.15123880546660726
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.16571305203663964
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.3762691853600945
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.16301171403498463
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.34463240030392384
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.24509462859331077
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.39649168256429074
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.21893599730050764
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.5728796992481204
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.25994005315432245
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.14653430680774066
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.2546845731733449
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.058403715092363084
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.37246318118748967
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.15806381880426276
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.20716804318138016
- }
- }
- },
- "MiniCPM_v2.6": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.20497125933706817
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.21340553041678637
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.33417132133610217
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.14556723677922526
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3507936507936508
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.3620762837308124
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.07517089101065139
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.25260048981169975
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.012567281814686655
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.34994481629202306
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.23021362338817897
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.3681846956052881
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.17128318830807052
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.21066692306852683
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.25947537124244935
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.23679437883858215
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.21540007432647457
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.39586776859504136
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.2036075191422558
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3711731498662282
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.20284349423013687
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.45156722842924535
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.2244713686485571
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.660718045112782
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.3045977370408878
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.18352505380246076
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.2657183000752527
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.06087615859328559
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.3977302205205499
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.17375496033997198
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.26814713591233313
- }
- }
- },
- "Molmo_72B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.2582151610644257
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.5042591723808818
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1279,
- "tasks": [],
- "average_score": 0.39648868632862583
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.2954490282663994
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.6714285714285714
- },
- "Perception": {
- "count": 82,
- "num_samples": 1306,
- "tasks": [],
- "average_score": 0.4704848349431393
- },
- "Planning": {
- "count": 44,
- "num_samples": 698,
- "tasks": [],
- "average_score": 0.13015529062282669
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.3557374102316002
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.18757766329699532
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.4405271103381682
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1509,
- "tasks": [],
- "average_score": 0.35176591065677537
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1300,
- "tasks": [],
- "average_score": 0.47052754190598134
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.24743187516175363
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1107,
- "tasks": [],
- "average_score": 0.3754692399771127
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5182,
- "tasks": [],
- "average_score": 0.3757024328002091
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3048441329189725
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.36443166533642163
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 552,
- "tasks": [],
- "average_score": 0.5421225239407056
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 679,
- "tasks": [],
- "average_score": 0.3342330361070466
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.4120820025247545
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1105,
- "tasks": [],
- "average_score": 0.3670439889863054
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 639,
- "tasks": [],
- "average_score": 0.445412976139552
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.3070615049173117
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 155,
- "tasks": [],
- "average_score": 0.5953120300751881
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1698,
- "tasks": [],
- "average_score": 0.4110431137367615
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1615,
- "tasks": [],
- "average_score": 0.2983397150768741
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.4223762317042425
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 340,
- "tasks": [],
- "average_score": 0.07825953913967484
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.5756984198310193
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1257,
- "tasks": [],
- "average_score": 0.29197652844726363
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1672,
- "tasks": [],
- "average_score": 0.41462128751753047
- }
- }
- },
- "Molmo_7B_D": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.1158110119047619
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.27184002856754413
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1279,
- "tasks": [],
- "average_score": 0.2787344822161389
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.1740048655548875
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3619047619047619
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.30311570603428784
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.06424366688759706
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.173722800705029
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.09043432702433757
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.3106093738160722
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.21356852314768052
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1300,
- "tasks": [],
- "average_score": 0.323282724310645
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.1327652104313917
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.18796442406686825
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5213,
- "tasks": [],
- "average_score": 0.22943156697817663
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.17305260714177756
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.17907829453546903
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 552,
- "tasks": [],
- "average_score": 0.3169618260527351
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.22086240998395923
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.324079404512755
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.21610753722787088
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 639,
- "tasks": [],
- "average_score": 0.32356781790614975
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.19244928377978027
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 155,
- "tasks": [],
- "average_score": 0.4433947368421053
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1698,
- "tasks": [],
- "average_score": 0.25685172601108597
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.17259103199957743
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.24958564675030656
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.035588894400059294
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.35830528296805764
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.1939605648275455
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.2239160707791646
- }
- }
- },
- "NVLM": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.37153871965452856
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.352859881186271
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.37572531212341936
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.2786818799518423
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3047619047619048
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.45079588183469294
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.1252138046141793
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.3518857602487131
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.09447890526012262
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.4387718807206103
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.32094439294995036
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.4332099707344069
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.30070480033875985
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.2814148428882822
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.345503562629823
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3215154320779893
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.29287492253780084
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5016004197822379
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.28793758479482745
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3828322321439372
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.34135355449546323
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.45915496990325566
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.3152573721587561
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6521954887218044
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.38986101015677044
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.30043411704099793
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.3359094293956291
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.07615011020932495
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.46386896656934745
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.26907670581189963
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.3943476764428869
- }
- }
- },
- "POINTS_15_7B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.31641062675070025
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.3095789895735217
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.35705988992418164
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.24128406446063128
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.48095238095238096
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.4420532221275683
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.1277481304284383
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.32551503611448934
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.15572486552610215
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.37330010041194067
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.30991539183635347
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.4276343385855984
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.24722440389191766
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.27713077639707523
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.32686003793394974
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3101162129247054
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.2614010338203017
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.4855568673750491
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.28761899055063767
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.37619796536407
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.3069044183161335
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.45980379926019677
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.30711751050032277
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6173496240601504
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.35317851821169477
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.28961632718794406
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.3333459246264911
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.08369131166291023
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.43105364189963935
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.26796963300870397
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.3443899066327916
- }
- }
- },
- "POINTS_7B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.3151282387955181
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.22503259387671015
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.27361452525243724
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.19633555542091463
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.34761904761904755
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.3737263982731003
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.08476480516686397
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.2606187882141402
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.1499797713556708
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.33916980654110634
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.25684059763242745
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.3523684400745285
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.19332242733156837
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.18689735511962233
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.2615189201461682
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.23004840221723208
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.239982641771955
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.4200183654729108
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.23646374895042882
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.28263350209672056
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.2320749867881998
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.36827291874151846
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.21311917080615544
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.48204135338345855
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.2799740367463896
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.22387504020162652
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.27890902837062037
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.06502747891786666
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.37373928195086786
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.21857370538972226
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.2684488868499041
- }
- }
- },
- "Phi-3.5-vision": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.3150531045751634
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.18412184931451608
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.3374902354661273
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.19473774010136682
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.4142857142857143
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.38360617573843164
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.09254779551496593
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.3034971430938622
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.04423070234557053
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.3249099963089235
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.2797292831010349
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.4073649042468842
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.1852656532829957
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.18482544209393917
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.2789407286767065
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.2141318618135909
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.23002523914604356
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.46076785167694245
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.20335546763980886
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.38510487366381607
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.2567782320477167
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.44526176399160444
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.26422404318271525
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6958045112781954
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.3097558922032538
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.22905610983444738
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.2845968124529633
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.08173397535709728
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.4299430434813172
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.21524515429041854
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.2682909697086125
- }
- }
- },
- "Pixtral_12B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.3689221521942111
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.4143415072482432
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.37374171749764634
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.27839183583970506
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3444444444444444
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.46377210154054166
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.122839038193565
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.35876745089800455
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.11048396896880823
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.3947713702430871
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.36461586731895695
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.4327891810625066
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.2688429906381188
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.31669784888602887
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.3567653041737331
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3161209026018942
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.29510067482559116
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5076172985263894
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.3135393276021012
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3995518703501119
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.36511340930610364
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.4193828210432134
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.35085932465399283
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6302142857142857
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.38842270268832113
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.3055711926752603
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.37359181974124417
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.08507904212012304
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.4677006268371793
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.28269833721806076
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.39551360119171197
- }
- }
- },
- "Qwen2_VL_2B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.18075323879551822
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.21948696002702636
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.28841305815072016
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.16147424237969243
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.36984126984126986
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.33781829803679747
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.08656714113327156
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.2448949597527861
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.09293971931071163
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.2842921728720087
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.23259922343062173
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.36205043973893236
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.13312812081322709
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.1930642044577058
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.24428672223428244
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.1652854805017628
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.17061075451792151
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.42328479601206864
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.23904036592289388
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3296071840681468
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.23210528191388644
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.37769658880841467
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.21906286524745977
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.5887067669172933
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.27091980735233906
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.19211647307230917
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.25965511679594977
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.07432337143230854
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.3778480095314066
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.19305913502727232
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.2253353309586889
- }
- }
- },
- "Qwen2_VL_72B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.48352372198879545
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.6323628750211533
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.4874613649312476
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.3355316008767396
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.39365079365079364
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.6141225191470527
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.23323065689783842
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.41914085094672937
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.18309869697155778
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.5251544991587351
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.4473618716373871
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.5308367876160253
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.4333175250859433
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.5070634902661117
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.4834475464413966
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.45605294241715827
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.4608929319719144
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5851458306003763
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.44066773476234555
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.4974532098882374
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.49191356756271953
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.5782670824874114
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.43580017776139807
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.7294097744360902
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.5387802130987105
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.40095954140813556
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.49559260360544427
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.1474368760019346
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.6040487985710314
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.367367170491919
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.5688395686544739
- }
- }
- },
- "Qwen2_VL_7B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.36572347689075624
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.44618631789079277
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.40527029084195965
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.25874500882297563
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.3507936507936508
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.47845712831657317
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.13224920829749706
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.28910547521894076
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.167887099917599
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.39575781162159634
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.3279988413468837
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.4722059967533397
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.27651089530142536
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.3555822260000372
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.3669159632302898
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.3343930759222326
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.3068323820854221
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5064978792251521
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.31569247186288174
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.39180263622429157
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.38908261098680974
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.4927960336040459
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.31735419703044254
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.6569285714285714
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.41129100495999377
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.28799562910106935
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.3844930054666535
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.08484497782236566
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.5151962864568788
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.2780300019986884
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.40533138482347386
- }
- }
- },
- "SmolVLM": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.05390625
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.03906165844850793
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.09639506190200878
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.06728619034079576
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.2222222222222222
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.1606753925138995
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.03272316763696074
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.13950042461525716
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.10013149786398344
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.143657576543239
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.0979843882877799
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.1383108182448921
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.09044016512537822
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.029842216842698305
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.09605051124900241
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.12682789970863723
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.05128016118728194
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.10496742314924135
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.09999979828107199
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.21315705831839693
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.015386904208215372
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.1293055688222371
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.077851045512787
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.2222067669172932
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.12889143083611815
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.0865768026006882
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.10501451629704919
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.008178053830227744
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.12403047579230878
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.061765081348496016
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.08610257462374318
- }
- }
- },
- "llava_onevision_72B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.3101241538281979
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.21993316800752236
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.4073185744352188
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.30843360355217414
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.4857142857142857
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.4151635490932759
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.14332941205758537
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.34229099411259356
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.15000864315905132
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.48700494939767686
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.3420108320438131
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.46321361231985364
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.1991087184305048
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.20630840715151963
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.32994677641726666
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.2595306800419483
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.3154587757748795
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.5216100397918579
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.29549573982348826
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3969569321996683
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.28638031668330033
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.49641793863653866
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.34020787956522225
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.677251879699248
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.367151258145213
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.2882162928135965
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.35493339032346644
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.08886502118921868
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.49931032043437723
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.28423002295958694
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.2705047345723313
- }
- }
- },
- "llava_onevision_7B": {
- "app": {
- "Coding": {
- "count": 16,
- "num_samples": 244,
- "tasks": [],
- "average_score": 0.20031585550887018
- },
- "Information_Extraction": {
- "count": 41,
- "num_samples": 644,
- "tasks": [],
- "average_score": 0.1340041159644947
- },
- "Knowledge": {
- "count": 77,
- "num_samples": 1294,
- "tasks": [],
- "average_score": 0.32565632074201306
- },
- "Mathematics": {
- "count": 30,
- "num_samples": 497,
- "tasks": [],
- "average_score": 0.19520567001898761
- },
- "Metrics": {
- "count": 3,
- "num_samples": 45,
- "tasks": [],
- "average_score": 0.5126984126984127
- },
- "Perception": {
- "count": 82,
- "num_samples": 1321,
- "tasks": [],
- "average_score": 0.3545352938542377
- },
- "Planning": {
- "count": 44,
- "num_samples": 714,
- "tasks": [],
- "average_score": 0.10542024755948716
- },
- "Science": {
- "count": 22,
- "num_samples": 469,
- "tasks": [],
- "average_score": 0.27440171167785654
- }
- },
- "input_format": {
- "3D Models and Aerial Imagery": {
- "count": 2,
- "num_samples": 30,
- "tasks": [],
- "average_score": 0.1783310257200802
- },
- "Artistic and Creative Content": {
- "count": 22,
- "num_samples": 389,
- "tasks": [],
- "average_score": 0.39584024260311845
- },
- "Diagrams and Data Visualizations": {
- "count": 88,
- "num_samples": 1524,
- "tasks": [],
- "average_score": 0.252511232938778
- },
- "Photographs": {
- "count": 83,
- "num_samples": 1315,
- "tasks": [],
- "average_score": 0.41346984169922946
- },
- "Text-Based Images and Documents": {
- "count": 53,
- "num_samples": 847,
- "tasks": [],
- "average_score": 0.1159417852705533
- },
- "User Interface Screenshots": {
- "count": 67,
- "num_samples": 1123,
- "tasks": [],
- "average_score": 0.1368238769607056
- }
- },
- "input_num": {
- "1-image": {
- "count": 315,
- "num_samples": 5228,
- "tasks": [],
- "average_score": 0.25687697499702805
- }
- },
- "output_format": {
- "contextual_formatted_text": {
- "count": 63,
- "num_samples": 975,
- "tasks": [],
- "average_score": 0.19203135933620985
- },
- "exact_text": {
- "count": 57,
- "num_samples": 880,
- "tasks": [],
- "average_score": 0.2490174433570946
- },
- "multiple_choice": {
- "count": 33,
- "num_samples": 567,
- "tasks": [],
- "average_score": 0.43553281735099914
- },
- "numerical_data": {
- "count": 39,
- "num_samples": 694,
- "tasks": [],
- "average_score": 0.22047389017098817
- },
- "open_ended_output": {
- "count": 51,
- "num_samples": 991,
- "tasks": [],
- "average_score": 0.3490743804978922
- },
- "structured_output": {
- "count": 72,
- "num_samples": 1121,
- "tasks": [],
- "average_score": 0.19236693222061413
- }
- },
- "skills": {
- "Commonsense and Social Reasoning": {
- "count": 38,
- "num_samples": 654,
- "tasks": [],
- "average_score": 0.4322205869643684
- },
- "Domain-Specific Knowledge and Skills": {
- "count": 46,
- "num_samples": 897,
- "tasks": [],
- "average_score": 0.24367762339842414
- },
- "Ethical and Safety Reasoning": {
- "count": 10,
- "num_samples": 170,
- "tasks": [],
- "average_score": 0.5779849624060149
- },
- "Language Understanding and Generation": {
- "count": 102,
- "num_samples": 1713,
- "tasks": [],
- "average_score": 0.28693734738201987
- },
- "Mathematical and Logical Reasoning": {
- "count": 91,
- "num_samples": 1630,
- "tasks": [],
- "average_score": 0.19593817255686638
- },
- "Object Recognition and Classification": {
- "count": 172,
- "num_samples": 2714,
- "tasks": [],
- "average_score": 0.292593666904816
- },
- "Planning and Decision Making": {
- "count": 23,
- "num_samples": 356,
- "tasks": [],
- "average_score": 0.07666140459493773
- },
- "Scene and Event Understanding": {
- "count": 60,
- "num_samples": 1004,
- "tasks": [],
- "average_score": 0.44333006096492455
- },
- "Spatial and Temporal Reasoning": {
- "count": 78,
- "num_samples": 1273,
- "tasks": [],
- "average_score": 0.2134151671467958
- },
- "Text Recognition (OCR)": {
- "count": 101,
- "num_samples": 1687,
- "tasks": [],
- "average_score": 0.19363816536239586
- }
- }
- }
-}
\ No newline at end of file
diff --git a/static/eval_results/SI/all_summary.json b/static/eval_results/SI/all_summary.json
deleted file mode 100644
index f1fa085799256ca76153f1797c742c435bffb125..0000000000000000000000000000000000000000
--- a/static/eval_results/SI/all_summary.json
+++ /dev/null
@@ -1,509 +0,0 @@
-{
- "Aquila_VL_2B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.20770364903712493,
- "micro_mean_score": 0.20333142638522636,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.31474202723571276,
- "micro_mean_score": 0.3326568265682657,
- "missing_tasks": []
- },
- "overall_score": 0.22197543279693666
- },
- "Aria": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3178882776147889,
- "micro_mean_score": 0.3101511832828904,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5137437248005172,
- "micro_mean_score": 0.5472939729397295,
- "missing_tasks": []
- },
- "overall_score": 0.34400233723955265
- },
- "Claude_3.5": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.520276385877485,
- "micro_mean_score": 0.5148202137998056
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.6479684260295507,
- "micro_mean_score": 0.6801968019680197
- },
- "overall_score": 0.5373019912310938
- },
- "Claude_3.5_new": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5462752278980763,
- "micro_mean_score": 0.5417881438289601
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.6764020657053476,
- "micro_mean_score": 0.6924969249692496
- },
- "overall_score": 0.5636254729390457
- },
- "GPT_4o": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5529953662872719,
- "micro_mean_score": 0.5483479105928085
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.6600228904804206,
- "micro_mean_score": 0.6801968019680197
- },
- "overall_score": 0.5672657028463584
- },
- "GPT_4o_mini": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4431039098921726,
- "micro_mean_score": 0.43780369290573373
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.595574663769726,
- "micro_mean_score": 0.6334563345633456
- },
- "overall_score": 0.46343334374251305
- },
- "Gemini_1.5_flash_002": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.43481964330318734,
- "micro_mean_score": 0.4297862001943635
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5787083135236054,
- "micro_mean_score": 0.6186961869618696
- },
- "overall_score": 0.4540047993325765
- },
- "Gemini_1.5_pro_002": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4914311038229404,
- "micro_mean_score": 0.48323615160349853
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5814975405131552,
- "micro_mean_score": 0.6174661746617466
- },
- "overall_score": 0.5034399620483024
- },
- "Idefics3": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.08941182847569326,
- "micro_mean_score": 0.08779475233900695,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3231434267517844,
- "micro_mean_score": 0.3618081180811809,
- "missing_tasks": []
- },
- "overall_score": 0.12057604157917208
- },
- "InternVL2_2B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.12069001041308772,
- "micro_mean_score": 0.11842605219090299,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.28522459992910454,
- "micro_mean_score": 0.28886838868388687,
- "missing_tasks": []
- },
- "overall_score": 0.14262795568189
- },
- "InternVL2_76B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3998616568018755,
- "micro_mean_score": 0.39149064302628933,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.554748737158244,
- "micro_mean_score": 0.5800738007380073,
- "missing_tasks": []
- },
- "overall_score": 0.42051326751605805
- },
- "InternVL2_8B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.27650612401825575,
- "micro_mean_score": 0.27119471729837735,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.39388373890935635,
- "micro_mean_score": 0.4045510455104551,
- "missing_tasks": []
- },
- "overall_score": 0.29215647267040246
- },
- "Llama_3_2_11B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.20789144960796493,
- "micro_mean_score": 0.20163641703273802,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3861125858565788,
- "micro_mean_score": 0.4130381303813038,
- "missing_tasks": []
- },
- "overall_score": 0.2316542677744468
- },
- "MiniCPM_v2.6": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.23230765810722817,
- "micro_mean_score": 0.22684118052665975,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4360655066213874,
- "micro_mean_score": 0.4588560885608856,
- "missing_tasks": []
- },
- "overall_score": 0.2594753712424494
- },
- "Molmo_72B": {
- "core": {
- "num_eval_tasks": 270,
- "num_eval_samples": 4073,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.36480000609384927,
- "micro_mean_score": 0.36205779758110807,
- "missing_tasks": [
- "table_understanding",
- "MMSoc_Misinformation_PolitiFact",
- "planning_screenshot_termes"
- ]
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4465682063915481,
- "micro_mean_score": 0.4850553505535054,
- "missing_tasks": []
- },
- "overall_score": 0.3758072638262318
- },
- "Molmo_7B_D": {
- "core": {
- "num_eval_tasks": 272,
- "num_eval_samples": 4102,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2098088446992518,
- "micro_mean_score": 0.20550929661464645,
- "missing_tasks": [
- "MMSoc_Misinformation_PolitiFact"
- ]
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.35697926179118733,
- "micro_mean_score": 0.38936039360393604,
- "missing_tasks": []
- },
- "overall_score": 0.22949405972428777
- },
- "NVLM": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.32989872890926025,
- "micro_mean_score": 0.32315683713111915,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4469349818134809,
- "micro_mean_score": 0.4881303813038132,
- "missing_tasks": []
- },
- "overall_score": 0.34550356262982296
- },
- "POINTS_15_7B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.31355970638319003,
- "micro_mean_score": 0.30728203432446294,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.41331219301389166,
- "micro_mean_score": 0.42749077490774917,
- "missing_tasks": []
- },
- "overall_score": 0.32686003793395024
- },
- "POINTS_7B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.25511317681632334,
- "micro_mean_score": 0.24927711632415062,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.30315625179016,
- "micro_mean_score": 0.3313653136531366,
- "missing_tasks": []
- },
- "overall_score": 0.26151892014616823
- },
- "Phi-3.5-vision": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2561274958722834,
- "micro_mean_score": 0.2504214576875906,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4272267419054576,
- "micro_mean_score": 0.445879458794588,
- "missing_tasks": []
- },
- "overall_score": 0.2789407286767066
- },
- "Pixtral_12B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3436942439614412,
- "micro_mean_score": 0.3373564384613738,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4417271955536318,
- "micro_mean_score": 0.4845633456334564,
- "missing_tasks": []
- },
- "overall_score": 0.3567653041737333
- },
- "Qwen2_VL_2B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22787906973244856,
- "micro_mean_score": 0.2234748515064842,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3509364634962041,
- "micro_mean_score": 0.3768757687576875,
- "missing_tasks": []
- },
- "overall_score": 0.24428672223428263
- },
- "Qwen2_VL_72B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4730536307784527,
- "micro_mean_score": 0.4659830915476831,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5510079982505317,
- "micro_mean_score": 0.5826568265682657,
- "missing_tasks": []
- },
- "overall_score": 0.48344754644139654
- },
- "Qwen2_VL_7B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3538656561495699,
- "micro_mean_score": 0.34581250459157137,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4517429592549692,
- "micro_mean_score": 0.4730012300123002,
- "missing_tasks": []
- },
- "overall_score": 0.3669159632302898
- },
- "SmolVLM": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.07348385181460795,
- "micro_mean_score": 0.0732694668402814,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2427337975725658,
- "micro_mean_score": 0.2504920049200492,
- "missing_tasks": []
- },
- "overall_score": 0.09605051124900234
- },
- "llava_onevision_72B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.312618242621264,
- "micro_mean_score": 0.3098623876487132,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4425822460912829,
- "micro_mean_score": 0.47539975399754,
- "missing_tasks": []
- },
- "overall_score": 0.32994677641726655
- },
- "llava_onevision_7B": {
- "core": {
- "num_eval_tasks": 273,
- "num_eval_samples": 4116,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.23683339637631812,
- "micro_mean_score": 0.23283041278687175,
- "missing_tasks": []
- },
- "open": {
- "num_eval_tasks": 42,
- "num_eval_samples": 813,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3871602360316429,
- "micro_mean_score": 0.4113161131611316,
- "missing_tasks": []
- },
- "overall_score": 0.25687697499702805
- }
-}
\ No newline at end of file
diff --git a/static/eval_results/SI/llava_onevision_72B/summary_results.json b/static/eval_results/SI/llava_onevision_72B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c67b592ab42608c0b29d97657f792f219073bd8b
--- /dev/null
+++ b/static/eval_results/SI/llava_onevision_72B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.312618242621264,
+ "micro_mean_score": 0.3098623876487132,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.4425822460912829,
+ "micro_mean_score": 0.47539975399754,
+ "missing_tasks": []
+ },
+ "overall_score": 0.32994677641726655
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.2705047345723313
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.35493339032346644
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.49931032043437723
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.2882162928135965
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.367151258145213
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.34020787956522225
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.28423002295958694
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.08886502118921868
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.49641793863653866
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.677251879699248
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.20630840715151963
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.1991087184305048
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.46321361231985364
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.3420108320438131
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.48700494939767686
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.15000864315905132
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.28638031668330033
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.2595306800419483
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.3154587757748795
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.29549573982348826
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3969569321996683
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.5216100397918579
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.32994677641726666
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.21993316800752236
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.14332941205758537
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.3101241538281979
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.4151635490932759
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.4857142857142857
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.34229099411259356
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.4073185744352188
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.30843360355217414
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/llava_onevision_72B/task_results.json b/static/eval_results/SI/llava_onevision_72B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e64e10176bf69dd6c2f3cd47aa2b99e41d457ad
--- /dev/null
+++ b/static/eval_results/SI/llava_onevision_72B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.21666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.08163265306122448,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.76,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.20833333333333334,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.4892773511051451,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.08573157203238838,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.4444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5480340058696004,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.35064935064935066,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.2657477500954247,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.7333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.6279999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.6010216430095937,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.12184873949579833,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.36842105263157904,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.5495762050605968,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.43773684210526315,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.4885714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.47368421052631576,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.84375,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.125,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.40873015873015867,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 1.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.7666714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.4807857142857144,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.2502288590795625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.6293368421052632,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.16964285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.2222222222222222,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.33035714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.5777777777777777,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.1111111111111111,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.05714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.12857142857142856,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.7777777777777778,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.009120898057654902,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8761904761904761,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.20408163265306123,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.18336944138598113,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.3,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.7894736842105263,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.4523809523809524,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.20202020202020202,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.2843137254901961,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.6034155146542303,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.3027210884353741,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.1875,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.021835265116335066,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.6071428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.18421052631578946,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.5882352941176471,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.17647058823529413,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.21739130434782608,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.6904761904761906,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6000000000000001,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.6666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.7857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.41913265306122444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.31428571428571433,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.4117647058823529,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.23999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.7071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.8357142857142857,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6500000000000001,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.6965517241379311,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.49999999999999994,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.7241379310344828,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.4928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.6413793103448276,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.710344827586207,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.5413793103448277,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.18620689655172415,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.7241379310344829,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.3548387096774194,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.4482758620689657,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.5714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.28666666666666657,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.6230769230769231,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.38888888888888884,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.35,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.6689655172413793,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.8052631578947368,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.5517241379310345,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.3928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.15,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.10714285714285714,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.07857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.049999999999999996,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.06428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.07857142857142856,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.4785714285714286,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.565,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.11052631578947371,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.535,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.6349999999999999,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8789473684210528,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.665,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.07142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/llava_onevision_7B/summary_results.json b/static/eval_results/SI/llava_onevision_7B/summary_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a8707a14e183e8e3fb3dbb2232b4d1fca07b301
--- /dev/null
+++ b/static/eval_results/SI/llava_onevision_7B/summary_results.json
@@ -0,0 +1,219 @@
+{
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.23683339637631812,
+ "micro_mean_score": 0.23283041278687175,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.3871602360316429,
+ "micro_mean_score": 0.4113161131611316,
+ "missing_tasks": []
+ },
+ "overall_score": 0.25687697499702805
+ },
+ "keyword_stats": {
+ "skills": {
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.19363816536239586
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.292593666904816
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.44333006096492455
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.19593817255686638
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.28693734738201987
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.24367762339842414
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.2134151671467958
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.07666140459493773
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.4322205869643684
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.5779849624060149
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.1368238769607056
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.1159417852705533
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.41346984169922946
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.252511232938778
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.39584024260311845
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.1783310257200802
+ }
+ },
+ "output_format": {
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.19236693222061413
+ },
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.19203135933620985
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.2490174433570946
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.22047389017098817
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.3490743804978922
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.43553281735099914
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.25687697499702805
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.1340041159644947
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.10542024755948716
+ },
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.20031585550887018
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.3545352938542377
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.5126984126984127
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.27440171167785654
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.32565632074201306
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.19520567001898761
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/llava_onevision_7B/task_results.json b/static/eval_results/SI/llava_onevision_7B/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..dbab1adf3b34fe5c4cd1fe0d5f48efeb76f3eacc
--- /dev/null
+++ b/static/eval_results/SI/llava_onevision_7B/task_results.json
@@ -0,0 +1,2207 @@
+[
+ {
+ "name": "science_molecule_chemistry",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "signboard_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "funsd_document_qa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "physical_property_reasoning",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_area",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_analytic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_selection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "map_diagram_qa",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_connectivity",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_isomorphism",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "question_solution_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "snli_ve_visual_entailment",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ti_fused_vqa_chemistry",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ili_ratio_future_prediction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_style_recognition",
+ "score": 0.9333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "multilingual_movie_info_parsing",
+ "score": 0.11224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_transformation",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "vibe_eval_short_phrase",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_breakpoint",
+ "score": 0.3333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "waybill_number_sequence_extraction",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "brand_logo_recognition_and_elaboration",
+ "score": 0.54,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "license_plate_recognition",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzle_single_step",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "code_programming_test_easy",
+ "score": 0.041666666666666664,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 24
+ },
+ {
+ "name": "chess_winner_identification",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chart_vqa",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_convexity_value_estimation",
+ "score": 0.24958220225240815,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "3d_indoor_scene_text_bbox_prediction",
+ "score": 0.07094776572587472,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "long_string_letter_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "dvqa",
+ "score": 0.42105263157894735,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "egocentric_spatial_reasoning",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "quizlet_question_solving",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_person_detection",
+ "score": 0.5626373174966284,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_maxflow",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "tqa_textbook_qa",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "game_info_parsing",
+ "score": 0.2727272727272727,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "animal_pose_estimation",
+ "score": 0.21324372091628846,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_understand_caption_match",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "table_understanding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "super_clevr",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "average_humidity_estimate_plot",
+ "score": 0.09466666666666665,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "nlvr2_two_image_compare_qa",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "face_keypoint_detection",
+ "score": 0.848854419078294,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "insect_order_classification",
+ "score": 0.13333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "deciphering_oracle_bone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "places365_scene_type_classification",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_basic_physics",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "long_string_number_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "handwritten_math_expression_extraction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_ocr_in_query_box",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "graph_theory",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "landmark_recognition_and_qa",
+ "score": 0.24444444444444444,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_solid",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_rated_hotel_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_info_parsing",
+ "score": 0.046218487394957986,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pmc_vqa_medical_image_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "electricity_future_prediction_from_table",
+ "score": 0.3899999999999999,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "TV_show_info_parsing",
+ "score": 0.30158730158730157,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "coco_object_detection_by_query_property",
+ "score": 0.4755460318827545,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "figureqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_biology",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "traffic_future_prediction_from_line_plot",
+ "score": 0.4832631578947368,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "mvsa_sentiment_classification",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "stock_price_future_prediction",
+ "score": 0.12864285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "kvqa_knowledge_aware_qa",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "hotel_booking_confirmation_parsing",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_planar",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "math_parity",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "famous_building_recognition",
+ "score": 0.40625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "multilingual_game_info_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "weather_info_parsing",
+ "score": 0.17063492063492064,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_shortest_path_kamada_kawai",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "widerface_face_count_and_event_classification",
+ "score": 0.4642857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_physics",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "graph_chordless_cycle",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_length",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "algebra",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "human_relationship_reasoning",
+ "score": 0.75,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "exchange_rate_estimate_plot",
+ "score": 0.36628571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_load_estimate_plot",
+ "score": 0.37007142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "single_person_pose_estimation",
+ "score": 0.11252203631219156,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ti_fused_vqa_math",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "electricity_plot_future_prediction",
+ "score": 0.4404578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "iconqa",
+ "score": 0.5263157894736842,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "movie_info_parsing",
+ "score": 0.16071428571428573,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "youtube_video_info_parsing",
+ "score": 0.05952380952380952,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "egocentric_analysis_single_image",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "music_info_parsing",
+ "score": 0.13392857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mnist_pattern",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "newspaper_page_parse_and_count",
+ "score": 0.35555555555555557,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "geometry_descriptive",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_relation",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_depth",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_distance",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cvbench_adapted_cvbench_count",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_computer_aided_design",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
+ "score": 0.16666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 18
+ },
+ {
+ "name": "multiple_states_identify_asia",
+ "score": 0.028571428571428574,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_americas",
+ "score": 0.1,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_europe",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multiple_states_identify_africa",
+ "score": 0.014285714285714287,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "flowchart_code_generation",
+ "score": 0.5555555555555556,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "healthcare_info_judgement",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_depth_of_different_points",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_visual_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "Ad_count_detection",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_sygyzy_endgames",
+ "score": 0.09455782312925169,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_grounding",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "position_relationship",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Multi",
+ "score": 0.39285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "webpage_code_understanding",
+ "score": 0.6666666666666666,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "extract_webpage_headline",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_note_count",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "geometry_reasoning_circled_letter",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "logical_reasoning_find_odd_one_out",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "paper_vqa",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "remaining_playback_time_calculation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "interpret_force_perspective_illusion",
+ "score": 0.6,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Web_Single",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "hashtag_recommendation",
+ "score": 0.8642857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_tyreworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "weather_map_climate_type_temperature_parsing",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_calculus_wo_solution",
+ "score": 0.061224489795918366,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "MMMU_physics_chemistry_MCQ",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_termes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "image_translation_en2cn",
+ "score": 0.09927221295148407,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "game_platform_support_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "music_sheet_format_QA",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "cultural_vqa",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "CLEVRER_physics",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "location_vqa",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_nested_squares",
+ "score": 0.10714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "circuit_diagram_understanding",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "scibench_fundamental_wo_solution",
+ "score": 0.10204081632653061,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 49
+ },
+ {
+ "name": "medical_cell_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_barman",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_blocksworld",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "planning_screenshot_storage",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "distinguish_ai_generated_image",
+ "score": 0.631578947368421,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "geometry_reasoning_count_line_intersections",
+ "score": 0.5357142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_grippers",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "llavaguard",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mindmap_elements_parsing",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autonomous_driving_scene_analysis",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "realworld_qa_en2cn",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "orchestra_score_recognition",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "highest_discount_game_price_identification",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counting",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "media_QA_web_stackoverflow",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "soccer_offside",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "song_title_identification_from_lyrics",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMMU_pro_exam_screenshot",
+ "score": 0.13131313131313133,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 99
+ },
+ {
+ "name": "medical_multi_organ_segmentation_rater",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "knowledge_graph_understanding",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "mensa_iq_test",
+ "score": 0.17107843137254902,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "play_go_capture_stone",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "medical_blood_vessels_recognition",
+ "score": 0.25,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "web_action_prediction",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "arxiv_vqa",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "top_video_creator_identification",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "annoying_word_search",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "GUI_Act_Mobile_tap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_grid",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "planning_screenshot_floortile",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Act_Mobile_swipe",
+ "score": 0.5569127613427832,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "recover_masked_word_in_figure",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "character_recognition_in_TV_shows",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "relative_reflectance_of_different_regions",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_image_artifacts_indentification",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "signage_navigation",
+ "score": 0.5333333333333333,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "transit_map_intersection_points",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "code_execution",
+ "score": 0.0625,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 16
+ },
+ {
+ "name": "icon_arithmetic_puzzle",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "geometry_reasoning_overlapped_circle",
+ "score": 0.32142857142857145,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chinese_idiom_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "monthly_weather_days_count",
+ "score": 0.2380952380952381,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "calendar_schedule_suggestion",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ascii_art_understanding",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_find_legal_moves",
+ "score": 0.03317029264010414,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "topological_sort",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "entertainment_web_game_style",
+ "score": 0.17857142857142858,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "polygon_interior_angles",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "mahjong",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "font_recognition",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_comparison",
+ "score": 0.42857142857142855,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "medical_counting_lymphocytes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "actor_recognition_in_Movie",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "product_ocr_qa",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "Bongard_Problem",
+ "score": 0.15789473684210525,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "logical_reasoning_fit_pattern",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "road_map_find_highway_between_two_place",
+ "score": 0.23529411764705882,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "ishihara_test",
+ "score": 0.4571428571428572,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "LaTeX_complex_formula_convertion",
+ "score": 0.058823529411764705,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "rebus",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 23
+ },
+ {
+ "name": "constrained_generation_contain_position_length",
+ "score": 0.2,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_repeat_length",
+ "score": 0.26666666666666666,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_contain_length",
+ "score": 0.8,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "constrained_generation_contain_length",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_equality",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "chess_puzzles_crushing",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "chess_puzzles_checkmate",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_single_question",
+ "score": 0.8571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "panel_images_multi_question",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "crossword_mini_5x5",
+ "score": 0.0071428571428571435,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_HatefulMemes",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_GossipCop",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Misinformation_PolitiFact",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MMSoc_Memotion",
+ "score": 0.6470588235294118,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "reward_models_I2T_reward",
+ "score": 0.7142857142857143,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_MATH",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "ocr_math_TheoremQA",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "counterfactual_arithmetic",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_homepage",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_publication",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "research_website_parsing_blogpost",
+ "score": 0.21428571428571427,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_complex_question_answering",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fact_verification",
+ "score": 0.619047619047619,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_swap",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_style",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_veracity",
+ "score": 0.9285714285714286,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_face_attribute_edit",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_text_entity_replace",
+ "score": 0.5714285714285714,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_out_of_context",
+ "score": 0.6428571428571429,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "MFC_Bench_check_background_change",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "number_puzzle_kakuro_5x5",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "number_puzzle_sudoku",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "maze_2d_8x8",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_shapes",
+ "score": 0.17729591836734696,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "shape_composition_colours",
+ "score": 0.3034297052154195,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_chinese_celebrity",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_indian_celebrity",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "memorization_papers",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "memorization_famous_treaty",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_csv",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_school_plain",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_text_latex",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_skill_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_experience_plain",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_authors",
+ "score": 0.03571428571428571,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_markdown",
+ "score": 0.2857142857142857,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_html",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_resume_employer_plain",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_math_equation",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_article_journal",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocr_table_to_latex",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact",
+ "score": 0.35714285714285715,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "autorater_artifact_reason",
+ "score": 0.4666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_haiku",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_limerick",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_petrarchian_sonnet_optional_meter",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_custom_rhyming_scheme",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_acrostic_alliteration",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "poetry_shakespearean_sonnet",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 0,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_cartoon_drawing_guess",
+ "score": 0.5,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_doodle_guess",
+ "score": 0.4,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "pictionary_skribbl_io",
+ "score": 0.15,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 20
+ },
+ {
+ "name": "pictionary_genai_output_chinese",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "pictionary_chinese_food_img2en",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_tiktok",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_notes",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_alipay",
+ "score": 0.11764705882352941,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 17
+ },
+ {
+ "name": "app_layout_understanding_amazon",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_instagram",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_ppt",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_leetcode",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_twitter",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_zoom",
+ "score": 0.06666666666666667,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "app_layout_understanding_iphone_settings",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_youtube",
+ "score": 0.14285714285714285,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_excel",
+ "score": 0.07142857142857142,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "app_layout_understanding_word",
+ "score": 0.0,
+ "eval_type": "rule",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "scibench_w_solution_open_ended",
+ "score": 0.15999999999999998,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 25
+ },
+ {
+ "name": "electrocardiogram",
+ "score": 0.27142857142857146,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "funny_image_title",
+ "score": 0.5642857142857143,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_captioning_with_additional_requirements",
+ "score": 0.6142857142857142,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "meme_explain",
+ "score": 0.6071428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "image_humor_understanding",
+ "score": 0.5896551724137932,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "tweets_captioning",
+ "score": 0.5499999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "defeasible_reasoning",
+ "score": 0.5344827586206896,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "vibe-eval",
+ "score": 0.2928571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "ocrqa",
+ "score": 0.5172413793103449,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "figurative_speech_explanation",
+ "score": 0.6275862068965519,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "docci_image_description_long",
+ "score": 0.6714285714285715,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bar_chart_interpretation",
+ "score": 0.35172413793103446,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "iq_test",
+ "score": 0.30344827586206885,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "unusual_images",
+ "score": 0.5172413793103448,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "GUI_Chat_Hard",
+ "score": 0.5161290322580645,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 31
+ },
+ {
+ "name": "graph_interpretation",
+ "score": 0.21724137931034485,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "traffic_accident_analysis",
+ "score": 0.7071428571428573,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "humor_explanation",
+ "score": 0.5399999999999999,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 15
+ },
+ {
+ "name": "GUI_Chat_Easy",
+ "score": 0.5384615384615384,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 26
+ },
+ {
+ "name": "table2latex_complex",
+ "score": 0.2888888888888889,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 9
+ },
+ {
+ "name": "visualization_with_code",
+ "score": 0.29285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "science_figure_explanation",
+ "score": 0.4620689655172414,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "guess_image_generation_prompt",
+ "score": 0.7421052631578948,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "art_explanation",
+ "score": 0.33103448275862063,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 29
+ },
+ {
+ "name": "bridge_strategies_worldclass",
+ "score": 0.0071428571428571435,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_expert",
+ "score": 0.4428571428571428,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "bridge_strategies_advanced",
+ "score": 0.03571428571428571,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+ "score": 0.0,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+ "score": 0.021428571428571432,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
+ "score": 0.028571428571428574,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
+ "score": 0.014285714285714287,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "table_understanding_fetaqa",
+ "score": 0.1928571428571429,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ },
+ {
+ "name": "red_teaming_racial",
+ "score": 0.7300000000000001,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_captcha",
+ "score": 0.10000000000000003,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_celebrity",
+ "score": 0.6150000000000002,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_politics",
+ "score": 0.655,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "red_teaming_visualmisleading",
+ "score": 0.8684210526315791,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 19
+ },
+ {
+ "name": "red_teaming_jailbreak",
+ "score": 0.555,
+ "eval_type": "llm",
+ "num_demo": 0,
+ "num_query": 20
+ },
+ {
+ "name": "ascii_art_30",
+ "score": 0.14285714285714285,
+ "eval_type": "llm",
+ "num_demo": 1,
+ "num_query": 14
+ }
+]
\ No newline at end of file
diff --git a/utils.py b/utils.py
index c1e6d15789aaf7b0b3bec09ced62662de9ef6759..a33c572a63f4bd4b13e5ceeb7b3636fde093d31d 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,7 @@
import pandas as pd
import json
from typing import Dict, Any, Tuple
+import os
# Keep all the constant mappings outside the class
MODEL_NAME_MAP = {
@@ -116,6 +117,8 @@ MODEL_URLS = {
"POINTS_15_7B": "https://huggingface.co./WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat",
"SmolVLM": "https://huggingface.co./HuggingFaceTB/SmolVLM-Instruct",
"Mammoth_VL": "https://huggingface.co./MAmmoTH-VL/MAmmoTH-VL-8B",
+ "InternVL2_5_78B": "https://huggingface.co./OpenGVLab/InternVL2_5-78B",
+ "InternVL2_5_2B": "https://huggingface.co./OpenGVLab/InternVL2_5-2B",
}
class BaseDataLoader:
@@ -190,52 +193,37 @@ class BaseDataLoader:
def _load_summary_data(self) -> Dict[str, Any]:
raise NotImplementedError("Subclasses must implement _load_summary_data")
- def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
- raise NotImplementedError("Subclasses must implement get_df")
-
- def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
- raise NotImplementedError("Subclasses must implement get_leaderboard_data")
-
-
-class DefaultDataLoader(BaseDataLoader):
- def __init__(self):
- super().__init__()
-
- def _load_model_data(self) -> Dict[str, Any]:
- with open("./static/eval_results/Default/all_model_keywords_stats.json", "r") as f:
- return json.load(f)
-
- def _load_summary_data(self) -> Dict[str, Any]:
- with open("./static/eval_results/Default/all_summary.json", "r") as f:
- return json.load(f)
-
def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
original_dimension = get_original_dimension(selected_super_group)
data = []
+
for model in self.MODEL_GROUPS[selected_model_group]:
+ if model not in self.MODEL_DATA or model not in self.SUMMARY_DATA:
+ continue
+
model_data = self.MODEL_DATA[model]
summary = self.SUMMARY_DATA[model]
- if summary["core_noncot"]:
- core_noncot_score = summary["core_noncot"]["macro_mean_score"]
- else:
- core_noncot_score = '-'
- if summary["core_cot"]:
- core_cot_score = summary["core_cot"]["macro_mean_score"]
- else:
- core_cot_score = '-'
+
+ # Basic model information
row = {
"Models": get_display_model_name(model, as_link=True),
"Overall": round(summary["overall_score"] * 100, 2),
- "Core w/o CoT": round(core_noncot_score * 100, 2) if core_noncot_score != '-' else '-',
- "Core w/ CoT": round(core_cot_score * 100, 2) if core_cot_score != '-' else '-',
- "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2) if summary["open"] else '-'
+ "Core": round(summary["core"]["macro_mean_score"] * 100, 2),
+ "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
}
- for display_name in self.SUPER_GROUPS[selected_super_group]:
- original_keyword = self.keyword_display_map[display_name]
- if original_dimension in model_data and original_keyword in model_data[original_dimension]:
- row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
- else:
+
+ # Add dimension-specific scores
+ if original_dimension in model_data:
+ for display_name in self.SUPER_GROUPS[selected_super_group]:
+ original_keyword = self.keyword_display_map[display_name]
+ if original_keyword in model_data[original_dimension]:
+ row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
+ else:
+ row[display_name] = None
+ else:
+ for display_name in self.SUPER_GROUPS[selected_super_group]:
row[display_name] = None
+
data.append(row)
df = pd.DataFrame(data)
@@ -246,8 +234,8 @@ class DefaultDataLoader(BaseDataLoader):
df = self.get_df(selected_super_group, selected_model_group)
# Get total task counts from the first model's data
- sample_model = next(iter(self.MODEL_DATA))
- total_core_tasks = self.SUMMARY_DATA[sample_model]["core_noncot"]["num_eval_tasks"]
+ sample_model = next(iter(self.SUMMARY_DATA))
+ total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"]
total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"]
total_tasks = total_core_tasks + total_open_tasks
@@ -255,8 +243,7 @@ class DefaultDataLoader(BaseDataLoader):
column_headers = {
"Models": "Models",
"Overall": f"Overall({total_tasks})",
- "Core w/o CoT": f"Core w/o CoT({total_core_tasks})",
- "Core w/ CoT": f"Core w/ CoT({total_core_tasks})",
+ "Core": f"Core({total_core_tasks})",
"Open-ended": f"Open-ended({total_open_tasks})"
}
@@ -266,93 +253,96 @@ class DefaultDataLoader(BaseDataLoader):
headers = [
column_headers["Models"],
column_headers["Overall"],
- column_headers["Core w/o CoT"],
- column_headers["Core w/ CoT"],
+ column_headers["Core"],
column_headers["Open-ended"]
] + self.SUPER_GROUPS[selected_super_group]
data = df[[
column_headers["Models"],
column_headers["Overall"],
- column_headers["Core w/o CoT"],
- column_headers["Core w/ CoT"],
+ column_headers["Core"],
column_headers["Open-ended"]
] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
return headers, data
-class SingleImageDataLoader(BaseDataLoader):
+class DefaultDataLoader(BaseDataLoader):
def __init__(self):
super().__init__()
def _load_model_data(self) -> Dict[str, Any]:
- with open("./static/eval_results/SI/all_model_keywords_stats.json", "r") as f:
- return json.load(f)
-
- def _load_summary_data(self) -> Dict[str, Any]:
- with open("./static/eval_results/SI/all_summary.json", "r") as f:
- return json.load(f)
-
- def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
- original_dimension = get_original_dimension(selected_super_group)
- data = []
- for model in self.MODEL_GROUPS[selected_model_group]:
- model_data = self.MODEL_DATA[model]
- summary = self.SUMMARY_DATA[model]
- row = {
- "Models": get_display_model_name(model, as_link=True),
- "Overall": round(summary["overall_score"] * 100, 2),
- "Core": round(summary["core"]["macro_mean_score"] * 100, 2),
- "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
- }
- for display_name in self.SUPER_GROUPS[selected_super_group]:
- original_keyword = self.keyword_display_map[display_name]
- if original_dimension in model_data and original_keyword in model_data[original_dimension]:
- row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
- else:
- row[display_name] = None
- data.append(row)
+ model_data = {}
+ base_path = "./static/eval_results/Default"
- df = pd.DataFrame(data)
- df = df.sort_values(by="Overall", ascending=False)
- return df
+ try:
+ model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]
+ for model_name in model_folders:
+ model_path = f"{base_path}/{model_name}/summary_results.json"
+ with open(model_path, "r") as f:
+ data = json.load(f)
+ if "keyword_stats" in data:
+ model_data[model_name] = data["keyword_stats"]
+ except FileNotFoundError:
+ pass
+
+ return model_data
- def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
- df = self.get_df(selected_super_group, selected_model_group)
+ def _load_summary_data(self) -> Dict[str, Any]:
+ summary_data = {}
+ base_path = "./static/eval_results/Default"
- # Get total task counts from the first model's data
- sample_model = next(iter(self.MODEL_DATA))
- total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"]
- total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"]
- total_tasks = total_core_tasks + total_open_tasks
+ try:
+ model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]
+ for model_name in model_folders:
+ model_path = f"{base_path}/{model_name}/summary_results.json"
+ with open(model_path, "r") as f:
+ data = json.load(f)
+ if "model_summary" in data:
+ summary_data[model_name] = data["model_summary"]
+ except FileNotFoundError:
+ pass
- # Define headers with task counts
- column_headers = {
- "Models": "Models",
- "Overall": f"Overall({total_tasks})",
- "Core": f"Core({total_core_tasks})",
- "Open-ended": f"Open-ended({total_open_tasks})"
- }
+ return summary_data
+
+
+class SingleImageDataLoader(BaseDataLoader):
+ def __init__(self):
+ super().__init__()
+
+ def _load_model_data(self) -> Dict[str, Any]:
+ model_data = {}
+ base_path = "./static/eval_results/SI"
- # Rename the columns in DataFrame to match headers
- df = df.rename(columns=column_headers)
+ try:
+ model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]
+ for model_name in model_folders:
+ model_path = f"{base_path}/{model_name}/summary_results.json"
+ with open(model_path, "r") as f:
+ data = json.load(f)
+ if "keyword_stats" in data:
+ model_data[model_name] = data["keyword_stats"]
+ except FileNotFoundError:
+ pass
- headers = [
- column_headers["Models"],
- column_headers["Overall"],
- column_headers["Core"],
- column_headers["Open-ended"]
- ] + self.SUPER_GROUPS[selected_super_group]
+ return model_data
+
+ def _load_summary_data(self) -> Dict[str, Any]:
+ summary_data = {}
+ base_path = "./static/eval_results/SI"
- data = df[[
- column_headers["Models"],
- column_headers["Overall"],
- column_headers["Core"],
- column_headers["Open-ended"]
- ] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
+ try:
+ model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]
+ for model_name in model_folders:
+ model_path = f"{base_path}/{model_name}/summary_results.json"
+ with open(model_path, "r") as f:
+ data = json.load(f)
+ if "model_summary" in data:
+ summary_data[model_name] = data["model_summary"]
+ except FileNotFoundError:
+ pass
- return headers, data
+ return summary_data
# Keep your helper functions
@@ -367,3 +357,4 @@ def get_display_model_name(model_name: str, as_link: bool = True) -> str:
if as_link and model_name in MODEL_URLS:
return f'{display_name}'
return display_name
+